[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon May 13 15:47:06 PDT 2024
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/72461
>From 75f1a46f910dd86edb465d6f3f6b4cf494baebaf Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 19:48:41 -0600
Subject: [PATCH 01/20] [AMDGPU] Allocate i1 argument to SGPRs
Currently i1 arguments are passed as 32-bit VGPRs. It would make more
sense to make use of SGPRs and pass these values as a wavesize bool mask.
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 5 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +++++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23 +++++++++
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 6 +++
llvm/test/CodeGen/AMDGPU/z_callee.ll | 33 ++++++++++++
llvm/test/CodeGen/AMDGPU/z_caller.ll | 43 ++++++++++++++++
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 57 +++++++++++++++++++++
7 files changed, 179 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4be64629ddac8..faf82d412eb0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,9 +187,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
- CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1] , CCAssignToReg<
+ !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
+ >>,
+
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a3a56e9b3a0b..88e387e1df609 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3668,6 +3668,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
+ // In the code below (after the call to AnalyzeCallOperands),
+ // if (!Subtarget->enableFlatScratch()), either s[48:51] or
+ // s[0:3] will be used. Therefore, before calling AnalyzeCallOperands, we may need to
+ // reserve these registers.
+ if (!Subtarget->enableFlatScratch()) {
+ if (IsChainCallConv)
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+ else
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 08351c49b2231..c0c093d3f4975 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,6 +861,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ // When calling convention allocates SGPR for i1 argument, we may
+ // have an SGPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
+ // the copy to avoid illegal copy.
+ if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ if (sub0 != DestReg)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -894,6 +904,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+ // from SGPR_32 to SReg_64. The following handles this case to avoid
+ // an illegal copy.
+ if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+ if (sub0 != SrcReg) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+ }
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+ RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 32dad0c425c04..e4b95b66287fd 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -481,6 +481,12 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
+ // When the calling convention allocates i1 argument to SGPR,
+ // we may have a COPY with dst being an SGPR_32. This should
+ // not be lowered into V_CNDMASK_B32.
+ if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+ continue;
+
Changed = true;
// Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 0000000000000..2fc4befa279f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s5, 0
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 0000000000000..faf25e407fca2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s4, -1
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_endpgm
+ call void @external_void_func_i1(i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 0000000000000..e63ae50b7e91c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s5
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: s_endpgm
+ %var = load volatile i1, ptr addrspace(1) undef
+ call void @external_void_func_i1_signext(i1 signext %var)
+ ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
>From 67365b80a1ffe5962699cbe6cd3e96d7bc05cd47 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 15 Nov 2023 20:37:27 -0600
Subject: [PATCH 02/20] Fix format.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +++++++------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 2 +-
3 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 88e387e1df609..9fce3de9e02d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3674,11 +3674,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// reserve these registers.
if (!Subtarget->enableFlatScratch()) {
if (IsChainCallConv)
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
- AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+ CCInfo.AllocateRegBlock(
+ ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
+ AMDGPU::SGPR51},
+ 4);
else
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
- AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+ AMDGPU::SGPR2, AMDGPU::SGPR3},
+ 4);
}
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c0c093d3f4975..3d89e6bcd6f3a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -876,7 +876,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -891,13 +891,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
return;
@@ -907,13 +907,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// When an i1 argument is allocated to an SGPR_32, we may have a COPY
// from SGPR_32 to SReg_64. The following handles this case to avoid
// an illegal copy.
- if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+ if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
if (sub0 != SrcReg) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+ RI.getSubReg(DestReg, AMDGPU::sub1))
+ .addImm(0);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index e4b95b66287fd..b2022714d9edc 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -484,7 +484,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
// When the calling convention allocates i1 argument to SGPR,
// we may have a COPY with dst being an SGPR_32. This should
// not be lowered into V_CNDMASK_B32.
- if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+ if (AMDGPU::SGPR_32RegClass.contains(DstReg))
continue;
Changed = true;
>From ae46c82f9561d50d85382db345e7eb627902cf14 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 12:31:17 -0600
Subject: [PATCH 03/20] Creating a custom calling conv function for i1.
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 9 +--
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 +++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 ------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 13 +--
llvm/test/CodeGen/AMDGPU/z_callee.ll | 7 +-
llvm/test/CodeGen/AMDGPU/z_caller.ll | 6 +-
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 4 +-
llvm/test/CodeGen/AMDGPU/z_return.ll | 80 +++++++++++++++++++
9 files changed, 137 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index faf82d412eb0c..863d489be4e83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -189,9 +189,7 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i1] , CCAssignToReg<
- !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
- >>,
+ CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
@@ -207,8 +205,9 @@ def CC_AMDGPU_Func : CallingConv<[
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
- CCIfType<[i1], CCPromoteToType<i32>>,
- CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d35a022ad6806..12c901ab2b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,37 @@
using namespace llvm;
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+
+ static const MCPhysReg I1RegList1[] = {
+ AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+ AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+ AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+ AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+ AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
+ };
+
+ static const MCPhysReg I1RegList2[] = {
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
+ AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
+ AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
+ AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
+ AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
+ AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
+ AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
+ };
+
+ assert (LocVT == MVT::i1);
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false; // not allocated
+}
+
#include "AMDGPUGenCallingConv.inc"
static cl::opt<bool> AMDGPUBypassSlowDiv(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9fce3de9e02d2..d18ce7ce4d0ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3026,8 +3026,13 @@ SDValue SITargetLowering::LowerFormalArguments(
RC = &AMDGPU::VGPR_32RegClass;
else if (AMDGPU::SGPR_32RegClass.contains(Reg))
RC = &AMDGPU::SGPR_32RegClass;
- else
- llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+ else {
+ if (VT == MVT::i1 && Subtarget->isWave64())
+ RC = &AMDGPU::SGPR_64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in LowerFormalArguments!");
+ }
+
EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3d89e6bcd6f3a..3db884b78e007 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -861,16 +861,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
- // When calling convention allocates SGPR for i1 argument, we may
- // have a SRPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
- // the copy to avoid illegal copy.
- if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
- auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
- if (sub0 != DestReg)
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
- return;
- }
-
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -904,20 +894,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
- // When an i1 argument is allocated to an SGPR_32, we may have a COPY
- // from SGPR_32 to SReg_64. The following handles this case to avoid
- // an illegal copy.
- if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
- auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
- if (sub0 != SrcReg) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
- }
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- RI.getSubReg(DestReg, AMDGPU::sub1))
- .addImm(0);
- return;
- }
-
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index b2022714d9edc..00d3eabc1afc0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -481,12 +481,6 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
- // When the calling convention allocates i1 argument to SGPR,
- // we may have a COPY with dst being an SGPR_32. This should
- // not be lowered into V_CNDMASK_B32.
- if (AMDGPU::SGPR_32RegClass.contains(DstReg))
- continue;
-
Changed = true;
// Copy into a 32-bit vector register.
@@ -695,6 +689,13 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+ if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+ // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
+ // return value is put in 64b SGPR.
+ assert(ST->isWave64());
+ continue;
+ }
+
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
index 2fc4befa279f3..44af2c90f900b 100644
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
define void @void_func_i1(i1 %arg0) #0 {
; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
@@ -11,7 +11,6 @@ define void @void_func_i1(i1 %arg0) #0 {
; CIGFX89-LABEL: void_func_i1:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: s_mov_b32 s5, 0
; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
index faf25e407fca2..f9203cf078e47 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
declare hidden void @external_void_func_i1(i1) #0
@@ -17,7 +17,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s4, -1
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
index e63ae50b7e91c..1141476960250 100644
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
new file mode 100644
index 0000000000000..6bf64da7a1b8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_return.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define i1 @i1_func_void() #0 {
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_i1_func_void() #0 {
+; CIGFX89-LABEL: test_call_i1_func_void:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s6, s33
+; CIGFX89-NEXT: s_mov_b32 s33, s32
+; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT: s_addk_i32 s32, 0x400
+; CIGFX89-NEXT: s_getpc_b64 s[4:5]
+; CIGFX89-NEXT: s_add_u32 s4, s4, i1_func_void at gotpcrel32@lo+4
+; CIGFX89-NEXT: s_addc_u32 s5, s5, i1_func_void at gotpcrel32@hi+12
+; CIGFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIGFX89-NEXT: v_writelane_b32 v1, s30, 0
+; CIGFX89-NEXT: v_writelane_b32 v1, s31, 1
+; CIGFX89-NEXT: s_waitcnt lgkmcnt(0)
+; CIGFX89-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CIGFX89-NEXT: global_store_byte v[2:3], v0, off
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: v_readlane_b32 s31, v1, 1
+; CIGFX89-NEXT: v_readlane_b32 s30, v1, 0
+; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
+; CIGFX89-NEXT: s_addk_i32 s32, 0xfc00
+; CIGFX89-NEXT: s_mov_b32 s33, s6
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, i1_func_void at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, i1_func_void at gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v1, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v1, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_readlane_b32 s31, v1, 1
+; GFX11-NEXT: v_readlane_b32 s30, v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: global_store_b8 v[2:3], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+
+ %val = call i1 @i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+
>From 721c34d2cc19d054fc857fcf2ab568554fd5381f Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 30 Nov 2023 20:04:19 -0600
Subject: [PATCH 04/20] Fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 ++++++++++---------
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 7 +--
2 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 12c901ab2b45a..6f8aa496f0120 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,31 +29,33 @@
using namespace llvm;
-static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static bool IsWave64 = static_cast<const GCNSubtarget&>(State.getMachineFunction().getSubtarget()).isWave64();
+static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ const bool IsWave64 = static_cast<const GCNSubtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .isWave64();
static const MCPhysReg I1RegList1[] = {
- AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
- AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
- AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
- AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
- AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29
- };
+ AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
+ AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
+ AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
+ AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
+ AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
static const MCPhysReg I1RegList2[] = {
- AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3, AMDGPU::SGPR4,
- AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7, AMDGPU::SGPR8, AMDGPU::SGPR9,
- AMDGPU::SGPR10, AMDGPU::SGPR11, AMDGPU::SGPR12, AMDGPU::SGPR13,
- AMDGPU::SGPR14, AMDGPU::SGPR15, AMDGPU::SGPR16, AMDGPU::SGPR17,
- AMDGPU::SGPR18, AMDGPU::SGPR19, AMDGPU::SGPR20, AMDGPU::SGPR21,
- AMDGPU::SGPR22, AMDGPU::SGPR23, AMDGPU::SGPR24, AMDGPU::SGPR25,
- AMDGPU::SGPR26, AMDGPU::SGPR27, AMDGPU::SGPR28, AMDGPU::SGPR29
- };
-
- assert (LocVT == MVT::i1);
- if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1) : State.AllocateReg(I1RegList2)) {
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3,
+ AMDGPU::SGPR4, AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7,
+ AMDGPU::SGPR8, AMDGPU::SGPR9, AMDGPU::SGPR10, AMDGPU::SGPR11,
+ AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+ AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+ AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+ AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+ AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+ assert(LocVT == MVT::i1);
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
+ : State.AllocateReg(I1RegList2)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 00d3eabc1afc0..a04ce16cbddb6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -689,9 +689,10 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
- if (!SrcReg.isVirtual() && TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
- // When calling convention allocates SGPR for i1, for GPUs with wavefront size 64, i1
- // return value is put in 64b SGPR.
+ if (!SrcReg.isVirtual() &&
+ TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
+ // When calling convention allocates SGPR for i1, for GPUs with
+ // wavefront size 64, i1 return value is put in 64b SGPR.
assert(ST->isWave64());
continue;
}
>From ca09dddea97071d08b9ad2a84d5a52a079a60f38 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Dec 2023 16:13:47 -0600
Subject: [PATCH 05/20] Fixed (1) problems for global-isel wrt both incoming
 args and return value (2) a problem in AMDGPUCallingConv.td when no sgprs are
available.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 20 +-
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 2 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 13 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
.../irtranslator-call-return-values.ll | 20 +-
.../AMDGPU/GlobalISel/irtranslator-call.ll | 42 +-
.../GlobalISel/irtranslator-function-args.ll | 243 ++++++++++--
.../GlobalISel/irtranslator-invariant.ll | 6 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 48 +--
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 6 +-
...amdgpu-codegenprepare-fold-binop-select.ll | 278 ++++++-------
llvm/test/CodeGen/AMDGPU/function-args.ll | 371 +++++++++++++++---
llvm/test/CodeGen/AMDGPU/function-returns.ll | 5 +
llvm/test/CodeGen/AMDGPU/z_callee.ll | 32 --
llvm/test/CodeGen/AMDGPU/z_caller.ll | 43 --
llvm/test/CodeGen/AMDGPU/z_caller2.ll | 57 ---
llvm/test/CodeGen/AMDGPU/z_return.ll | 80 ----
17 files changed, 754 insertions(+), 514 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_callee.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_caller2.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/z_return.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7e1f041fa1093..5e1b551a853eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -124,7 +124,15 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
if (VA.getLocVT().getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ unsigned CopyToBits = 32;
+
+ // When function return type is i1, it may be in a 64b register.
+ if (VA.getLocVT().getSizeInBits() == 1) {
+ if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+ CopyToBits = 64;
+ }
+
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
// If we have signext/zeroext, it applies to the whole 32-bit register
// before truncation.
@@ -233,7 +241,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+ Register ExtReg;
+
+ if (VA.getLocVT().getSizeInBits() == 1 &&
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+ } else {
+ ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+ }
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 863d489be4e83..0a197e4a786cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -191,6 +191,8 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
+ CCIfType<[i1], CCPromoteToType<i32>>,
+
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 6f8aa496f0120..02cb248836df1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
- static bool IsWave64 = static_cast<const GCNSubtarget &>(
- State.getMachineFunction().getSubtarget())
- .isWave64();
+ static bool IsWave64 =
+ State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
- static const MCPhysReg I1RegList1[] = {
+ static const MCPhysReg SGPRArgsWave64[] = {
AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
AMDGPU::SGPR6_SGPR7, AMDGPU::SGPR8_SGPR9, AMDGPU::SGPR10_SGPR11,
AMDGPU::SGPR12_SGPR13, AMDGPU::SGPR14_SGPR15, AMDGPU::SGPR16_SGPR17,
AMDGPU::SGPR18_SGPR19, AMDGPU::SGPR20_SGPR21, AMDGPU::SGPR22_SGPR23,
AMDGPU::SGPR24_SGPR25, AMDGPU::SGPR26_SGPR27, AMDGPU::SGPR28_SGPR29};
- static const MCPhysReg I1RegList2[] = {
+ static const MCPhysReg SGPRArgsWave32[] = {
AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3,
AMDGPU::SGPR4, AMDGPU::SGPR5, AMDGPU::SGPR6, AMDGPU::SGPR7,
AMDGPU::SGPR8, AMDGPU::SGPR9, AMDGPU::SGPR10, AMDGPU::SGPR11,
@@ -54,8 +53,8 @@ static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
AMDGPU::SGPR28, AMDGPU::SGPR29};
assert(LocVT == MVT::i1);
- if (unsigned Reg = IsWave64 ? State.AllocateReg(I1RegList1)
- : State.AllocateReg(I1RegList2)) {
+ if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
+ : State.AllocateReg(SGPRArgsWave32)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d18ce7ce4d0ca..297d38385852f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3028,7 +3028,7 @@ SDValue SITargetLowering::LowerFormalArguments(
RC = &AMDGPU::SGPR_32RegClass;
else {
if (VT == MVT::i1 && Subtarget->isWave64())
- RC = &AMDGPU::SGPR_64RegClass;
+ RC = Subtarget->getBoolRC();
else
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 37f2118572d84..3db0acceec0b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -198,9 +198,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
@@ -275,10 +275,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
; GCN-NEXT: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -336,10 +335,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
- ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+ ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
; GCN-NEXT: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 392b0ae6823e4..e546144ce3373 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
@@ -381,7 +381,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
call void @external_void_func_i1(i1 true)
@@ -426,12 +426,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
@@ -439,7 +439,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
%var = load volatile i1, ptr addrspace(1) undef
@@ -485,12 +485,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
@@ -498,7 +498,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
%var = load volatile i1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 6d32d4c720c99..2c8f22ed57ab2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,6 +3,7 @@
; the frame info, so some functions have manually added stack object
; checks.
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=GFX1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
; FIXME: pre-VI should have same ABI without legal i16 operations.
define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -34,10 +35,10 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
define void @void_func_i1(i1 %arg0) #0 {
; CHECK-LABEL: name: void_func_i1
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -48,11 +49,10 @@ define void @void_func_i1(i1 %arg0) #0 {
define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_zeroext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
@@ -68,11 +68,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_signext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
@@ -89,10 +88,10 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CHECK-LABEL: name: i1_arg_i1_use
; CHECK: bb.1.bb:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -1986,25 +1985,25 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
- ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
- ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
- ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5)
- ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16)
- ; CHECK-NEXT: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
- ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5)
+ ; CHECK-NEXT: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
; CHECK-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
+ ; CHECK-NEXT: G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: G_STORE [[LOAD4]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
@@ -3230,6 +3229,196 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+; Check calling convention for i1 args
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; CHECK-LABEL: name: many_i1_args
+; CHECK: bb.1 (%ir-block.0):
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: {{ $}}
+; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; CHECK-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; CHECK-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; CHECK-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
+; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; CHECK-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
+; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; CHECK: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ec07b0b1d4f45..ac1eb4e2adda0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,10 +22,10 @@ define i32 @load_const_i32_gv() {
define i32 @load_select_const_i32_gv(i1 %cond) {
; CHECK-LABEL: name: load_select_const_i32_gv
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4caf83774bbba..979590fd11688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -11,8 +11,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -20,8 +20,8 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -29,32 +29,32 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f32:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_W64-LABEL: v_div_fmas_f32:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W32-LABEL: v_div_fmas_f32:
; GFX11_W32: ; %bb.0:
; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11_W32-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX11_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W64-LABEL: v_div_fmas_f32:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX11_W64-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX11_W64-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
@@ -65,8 +65,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -74,8 +74,8 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -83,32 +83,32 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f64:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_W64-LABEL: v_div_fmas_f64:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W32-LABEL: v_div_fmas_f64:
; GFX11_W32: ; %bb.0:
; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W32-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11_W32-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11_W32-NEXT: s_setpc_b64 s[30:31]
;
; GFX11_W64-LABEL: v_div_fmas_f64:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX11_W64-NEXT: s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11_W64-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 36bac87889cac..1cff9ba4d2340 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,9 +168,9 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-LABEL: localize_internal_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT: s_and_b32 s4, 1, s0
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execnz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31..9beec51710598 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -10,11 +10,10 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1e848
-; GCN-NEXT: v_mov_b32_e32 v2, 0x30d40
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0x30d40
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1e848
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 8
%op = sdiv i32 1000000, %select
@@ -29,11 +28,10 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x2710
-; GCN-NEXT: v_mov_b32_e32 v2, 0x3e8
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_movk_i32 s6, 0x3e8
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x2710
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 42000, i32 420000
%op = sdiv i32 %select, 42
@@ -48,11 +46,10 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_const_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x22b
-; GCN-NEXT: v_mov_b32_e32 v2, 0x29a
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_movk_i32 s6, 0x29a
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x22b
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, 0x594
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
@@ -68,14 +65,13 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_const_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3661c
-; GCN-NEXT: v_mov_b32_e32 v2, 0x307dd
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, 0x23b02a
-; GCN-NEXT: v_mov_b32_e32 v2, 0x13e3a0c
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_mov_b32 s6, 0x307dd
+; GCN-NEXT: s_mov_b32 s5, 0x13e3a0c
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x3661c
+; GCN-NEXT: s_cselect_b32 s5, s5, 0x23b02a
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
%op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -126,40 +122,41 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: s_cselect_b32 s4, s6, 5
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: s_xor_b32 s4, s4, s5
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT: s_sub_i32 s6, 0, s4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT: s_mov_b32 s6, 0xf4240
+; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf4240
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: s_mul_i32 s6, s6, s4
+; GCN-NEXT: s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT: s_sub_i32 s7, s6, s4
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: s_cselect_b32 s6, s7, s6
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT: v_subrev_u32_e32 v0, vcc, s5, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
%op = sdiv i32 1000000, %select
@@ -208,40 +205,41 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: s_cselect_b32 s4, 5, s6
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: s_xor_b32 s4, s4, s5
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT: s_sub_i32 s6, 0, s4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_mul_lo_u32 v1, s6, v0
+; GCN-NEXT: s_mov_b32 s6, 0xf4240
+; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf4240
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
-; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s6
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: s_mul_i32 s6, s6, s4
+; GCN-NEXT: s_sub_i32 s6, 0xf4240, s6
+; GCN-NEXT: s_sub_i32 s7, s6, s4
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: s_cselect_b32 s6, s7, s6
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s6, s4
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s5, v0
+; GCN-NEXT: v_subrev_u32_e32 v0, vcc, s5, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
%op = sdiv i32 1000000, %select
@@ -257,18 +255,15 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x392fa
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x392fa
+; GCN-NEXT: v_mul_hi_i32 v0, s4, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -287,18 +282,15 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0xa410
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, gv@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, gv@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT: s_cselect_b32 s4, 0xa410, s6
+; GCN-NEXT: v_mul_hi_i32 v0, s4, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -316,11 +308,10 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_add_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0xf4248
-; GCN-NEXT: v_mov_b32_e32 v2, 0xf4245
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0xf4245
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0xf4248
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 5, i32 8
%op = add i32 1000000, %select
@@ -335,11 +326,9 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40a00000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, float 2.0, float 4.0
%op = fadd nnan nsz float 1.0, %select
@@ -351,12 +340,10 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
; GCN-LABEL: select_mul_lhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT: v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_movk_i32 s6, 0x1388
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; IR-LABEL: @select_mul_lhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
@@ -370,12 +357,10 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
; GCN-LABEL: select_mul_rhs_const_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1f40
-; GCN-NEXT: v_mov_b32_e32 v2, 0x1388
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_movk_i32 s6, 0x1388
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x1f40
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; IR-LABEL: @select_mul_rhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
@@ -411,9 +396,7 @@ define i16 @select_add_trunc_select(i1 %cond) {
; GCN-LABEL: select_add_trunc_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
; IR-LABEL: @select_add_trunc_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -432,9 +415,9 @@ define i32 @select_add_sext_select(i1 %cond) {
; GCN-LABEL: select_add_sext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, 29, 50
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 -13, i16 8
%trunc = sext i16 %select to i32
@@ -450,9 +433,9 @@ define i32 @select_add_zext_select(i1 %cond) {
; GCN-LABEL: select_add_zext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, 47, 50
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 5, i16 8
%trunc = zext i16 %select to i32
@@ -468,11 +451,10 @@ define i32 @select_add_bitcast_select(i1 %cond) {
; GCN-LABEL: select_add_bitcast_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x4000002a
-; GCN-NEXT: v_mov_b32_e32 v2, 0x3f80002a
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_mov_b32 s6, 0x3f80002a
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_cselect_b32 s4, s6, 0x4000002a
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, float 1.0, float 2.0
%trunc = bitcast float %select to i32
@@ -493,10 +475,8 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
; GCN-LABEL: multi_use_cast_regression:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c00
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_sub_f32_e32 v1, 1.0, v0
; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3b2f15c8340a6..9fca84ef2667c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -8,7 +8,7 @@ define void @void_func_i1(i1 %arg0) #0 {
; CIGFX89-LABEL: void_func_i1:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
; GFX11-LABEL: void_func_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -31,6 +31,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CIGFX89-LABEL: void_func_i1_zeroext:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CIGFX89-NEXT: v_or_b32_e32 v0, 12, v0
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
@@ -41,9 +42,11 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; GFX11-LABEL: void_func_i1_zeroext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext = zext i1 %arg0 to i32
@@ -56,7 +59,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CI-LABEL: void_func_i1_signext:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT: v_sub_i32_e32 v0, vcc, 12, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -66,7 +70,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; VI-LABEL: void_func_i1_signext:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 12, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -76,7 +81,8 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; GFX9-LABEL: void_func_i1_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v0, 12, v0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -86,9 +92,11 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; GFX11-LABEL: void_func_i1_signext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 12, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext = sext i1 %arg0 to i32
@@ -101,9 +109,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-LABEL: i1_arg_i1_use:
; CIGFX89: ; %bb.0: ; %bb
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
; CIGFX89-NEXT: s_cbranch_execz .LBB3_2
; CIGFX89-NEXT: ; %bb.1: ; %bb1
@@ -119,11 +125,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; GFX11-LABEL: i1_arg_i1_use:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT: s_xor_b32 s1, s0, -1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s1
; GFX11-NEXT: s_cbranch_execz .LBB3_2
; GFX11-NEXT: ; %bb.1: ; %bb1
@@ -2774,13 +2778,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,15 +2791,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; CI-NEXT: v_and_b32_e32 v0, 1, v17
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v18, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2818,13 +2820,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; VI-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2833,14 +2834,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 1, v20
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v19, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2859,15 +2859,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2876,14 +2873,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -2892,16 +2888,15 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16
-; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_cndmask_b32_e64 v32, 0, 1, s0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2910,8 +2905,6 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: v_and_b32_e32 v16, 1, v32
; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2920,7 +2913,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
@@ -4634,7 +4627,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
ret void
}
-
define void @void_func_bf16(bfloat %arg0) #0 {
; CI-LABEL: void_func_bf16:
; CI: ; %bb.0:
@@ -4891,4 +4883,275 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
ret void
}
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_saveexec_b64 vcc, -1
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, vcc
+; GFX9-NEXT: v_writelane_b32 v19, s30, 0
+; GFX9-NEXT: v_writelane_b32 v19, s31, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT: s_mov_b32 s31, 0xf000
+; GFX9-NEXT: s_mov_b32 s30, -1
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[10:11]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[12:13]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[14:15]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[16:17]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[18:19]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[20:21]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[22:23]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[24:25]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[26:27]
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v19, 1
+; GFX9-NEXT: v_readlane_b32 s30, v19, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT: s_mov_b32 s30, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_mov_b32 s31, 0x31016000
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s11
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s13
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s16
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s17
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s18
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s19
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s20
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s21
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s22
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s24
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s25
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s26
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s27
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s28
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v1, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 401cbce00ac9a..df2163c4f9578 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -12,6 +12,8 @@ define i1 @i1_func_void() #0 {
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
+; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
@@ -21,6 +23,9 @@ define i1 @i1_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) undef
ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
deleted file mode 100644
index 44af2c90f900b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_callee.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define void @void_func_i1(i1 %arg0) #0 {
-; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
-; Therefore, the "s_mov_b32 s5, 0" is generated.
-;
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT: s_mov_b32 s6, -1
-; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: void_func_i1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- store i1 %arg0, ptr addrspace(1) undef
- ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
deleted file mode 100644
index f9203cf078e47..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1(i1) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[8:9]
-; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s0, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_endpgm
- call void @external_void_func_i1(i1 true)
- ret void
-}
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
deleted file mode 100644
index 1141476960250..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_caller2.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-
-declare hidden void @external_void_func_i1_signext(i1 signext) #0
-
-define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[8:9]
-; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: test_call_external_void_func_i1_signext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_getpc_b64 s[4:5]
-; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11-NEXT: s_mov_b32 s0, s2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT: s_endpgm
- %var = load volatile i1, ptr addrspace(1) undef
- call void @external_void_func_i1_signext(i1 signext %var)
- ret void
-}
-
-
-
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_return.ll b/llvm/test/CodeGen/AMDGPU/z_return.ll
deleted file mode 100644
index 6bf64da7a1b8f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/z_return.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-
-define i1 @i1_func_void() #0 {
- %val = load i1, ptr addrspace(1) undef
- ret i1 %val
-}
-
-define void @test_call_i1_func_void() #0 {
-; CIGFX89-LABEL: test_call_i1_func_void:
-; CIGFX89: ; %bb.0:
-; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT: s_mov_b32 s6, s33
-; CIGFX89-NEXT: s_mov_b32 s33, s32
-; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
-; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT: s_addk_i32 s32, 0x400
-; CIGFX89-NEXT: s_getpc_b64 s[4:5]
-; CIGFX89-NEXT: s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
-; CIGFX89-NEXT: s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
-; CIGFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CIGFX89-NEXT: v_writelane_b32 v1, s30, 0
-; CIGFX89-NEXT: v_writelane_b32 v1, s31, 1
-; CIGFX89-NEXT: s_waitcnt lgkmcnt(0)
-; CIGFX89-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CIGFX89-NEXT: global_store_byte v[2:3], v0, off
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: v_readlane_b32 s31, v1, 1
-; CIGFX89-NEXT: v_readlane_b32 s30, v1, 0
-; CIGFX89-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
-; CIGFX89-NEXT: s_mov_b64 exec, s[4:5]
-; CIGFX89-NEXT: s_addk_i32 s32, 0xfc00
-; CIGFX89-NEXT: s_mov_b32 s33, s6
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
-; CIGFX89-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_i1_func_void:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s2, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v1, s30, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v1, s31, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readlane_b32 s31, v1, 1
-; GFX11-NEXT: v_readlane_b32 s30, v1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: global_store_b8 v[2:3], v0, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: s_mov_b32 s33, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-
- %val = call i1 @i1_func_void()
- store volatile i1 %val, ptr addrspace(1) undef
- ret void
-}
-
-attributes #0 = { nounwind }
-
-
>From f26afca5d23d1ad9bf02883cbd2ccfb97414457b Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 16 Jan 2024 16:22:20 -0600
Subject: [PATCH 06/20] Minor changes based on code review.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 11 +++++------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5e1b551a853eb..94c62f8ddc0e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -127,7 +127,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
unsigned CopyToBits = 32;
// When function return type is i1, it may be in a 64b register.
- if (VA.getLocVT().getSizeInBits() == 1) {
+ if (VA.getLocVT() == MVT::i1) {
if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
CopyToBits = 64;
}
@@ -241,15 +241,14 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg;
- if (VA.getLocVT().getSizeInBits() == 1 &&
+ if (VA.getLocVT() == MVT::i1 &&
MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
- ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
- } else {
- ExtReg = extendRegisterMin32(*this, ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ return;
}
+ Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 297d38385852f..54797b29d8965 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3027,7 +3027,7 @@ SDValue SITargetLowering::LowerFormalArguments(
else if (AMDGPU::SGPR_32RegClass.contains(Reg))
RC = &AMDGPU::SGPR_32RegClass;
else {
- if (VT == MVT::i1 && Subtarget->isWave64())
+ if (VT == MVT::i1)
RC = Subtarget->getBoolRC();
else
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
>From 26fa9cc68172db8d26d13427ebddca5c16355e8a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 16:23:43 -0600
Subject: [PATCH 07/20] Additional change based on code review.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 94c62f8ddc0e1..53dbae7765803 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -128,7 +128,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT() == MVT::i1) {
- if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
+ if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
CopyToBits = 64;
}
@@ -243,7 +243,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
MIB.addUse(PhysReg, RegState::Implicit);
if (VA.getLocVT() == MVT::i1 &&
- MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
+ MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
MIRBuilder.buildCopy(PhysReg, ValVReg);
return;
}
>From 3b323d98e74d89ada3cf9c1338ef9ef89a62e84d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 31 Jan 2024 12:54:51 -0600
Subject: [PATCH 08/20] Changing a vector of 4 registers to a single register.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 54797b29d8965..0fdb3c4e36c67 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3679,14 +3679,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// reserve these registers.
if (!Subtarget->enableFlatScratch()) {
if (IsChainCallConv)
- CCInfo.AllocateRegBlock(
- ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
- AMDGPU::SGPR51},
- 4);
+ CCInfo.AllocateReg(AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51);
else
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
- AMDGPU::SGPR2, AMDGPU::SGPR3},
- 4);
+ CCInfo.AllocateReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
}
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
>From b4c0bb9e5e5d8fcbbe861682edd910db48266189 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 2 Feb 2024 16:07:43 -0600
Subject: [PATCH 09/20] Update some test files.
---
.../AMDGPU/GlobalISel/irtranslator-call.ll | 21 +-
.../GlobalISel/irtranslator-function-args.ll | 173 +-
.../GlobalISel/irtranslator-invariant.ll | 4 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 16 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 1923 ++++++-----------
llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 325 ++-
.../CodeGen/AMDGPU/call-argument-types.ll | 155 +-
.../CodeGen/AMDGPU/combine_andor_with_cmps.ll | 474 ++--
.../dagcombine-v1i8-extractvecelt-crash.ll | 13 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 42 +-
llvm/test/CodeGen/AMDGPU/extract-load-i1.ll | 2 +
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 130 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 344 +--
14 files changed, 1395 insertions(+), 2229 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index e546144ce3373..d0a17bc48c185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -368,12 +368,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[C]](s1)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
@@ -426,12 +425,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[SEXT]](s64)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
@@ -485,12 +483,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
- ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
- ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 2c8f22ed57ab2..d239b7271dd89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -35,9 +35,9 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
define void @void_func_i1(i1 %arg0) #0 {
; CHECK-LABEL: name: void_func_i1
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
@@ -49,9 +49,9 @@ define void @void_func_i1(i1 %arg0) #0 {
define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_zeroext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -68,9 +68,9 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CHECK-LABEL: name: void_func_i1_signext
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -88,9 +88,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CHECK-LABEL: name: i1_arg_i1_use
; CHECK: bb.1.bb:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -1988,7 +1988,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
- ; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
@@ -2781,8 +2781,8 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr16
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -3229,6 +3229,9 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
store <2 x ptr addrspace(3)> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
; Check calling convention for i1 args
define void @many_i1_args(
i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
@@ -3237,71 +3240,71 @@ define void @many_i1_args(
i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
; CHECK-LABEL: name: many_i1_args
; CHECK: bb.1 (%ir-block.0):
-; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
; CHECK-NEXT: {{ $}}
-; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
-; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
-; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
-; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
-; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
-; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; CHECK-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
-; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; CHECK-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
-; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; CHECK-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s64)
-; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; CHECK-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s64)
-; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr0
+; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr8
; CHECK-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr1
+; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr9
; CHECK-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr2
+; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr10
; CHECK-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr3
+; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr11
; CHECK-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr4
+; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr12
; CHECK-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr5
+; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr13
; CHECK-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr6
+; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr14
; CHECK-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr7
+; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr15
; CHECK-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr8
+; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr16
; CHECK-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr9
+; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr17
; CHECK-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr10
+; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr18
; CHECK-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr11
+; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr19
; CHECK-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr12
+; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr20
; CHECK-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr13
+; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr21
; CHECK-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr14
+; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr22
; CHECK-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr15
+; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr23
; CHECK-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr16
+; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr24
; CHECK-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
;
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -3311,71 +3314,71 @@ define void @many_i1_args(
;
; GFX11-LABEL: name: many_i1_args
; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr17
; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr18
; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr19
; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr20
; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr21
; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr22
; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr23
; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr24
; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr25
; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr26
; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr27
; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr28
; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr29
; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr5
; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr6
; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr7
; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr8
; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr9
; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr10
; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr11
; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr12
; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr13
; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr14
; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr15
; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr16
; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr17
; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
;
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ac1eb4e2adda0..6360c5c2cbb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -22,9 +22,9 @@ define i32 @load_const_i32_gv() {
define i32 @load_select_const_i32_gv(i1 %cond) {
; CHECK-LABEL: name: load_select_const_i32_gv
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 979590fd11688..44014f2546814 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -11,7 +11,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: s_and_b32 s4, 1, s4
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -20,7 +20,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: s_and_b32 s4, 1, s4
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -29,7 +29,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f32:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s4
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
@@ -37,7 +37,7 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
; GFX10_W64-LABEL: v_div_fmas_f32:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s4
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
@@ -65,7 +65,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX7-LABEL: v_div_fmas_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: s_and_b32 s4, 1, s4
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -74,7 +74,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX8-LABEL: v_div_fmas_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s4, 1, s0
+; GFX8-NEXT: s_and_b32 s4, 1, s4
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -83,7 +83,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX10_W32-LABEL: v_div_fmas_f64:
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W32-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W32-NEXT: s_and_b32 s4, 1, s4
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W32-NEXT: s_setpc_b64 s[30:31]
@@ -91,7 +91,7 @@ define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
; GFX10_W64-LABEL: v_div_fmas_f64:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: s_and_b32 s4, 1, s0
+; GFX10_W64-NEXT: s_and_b32 s4, 1, s4
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10_W64-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 1cff9ba4d2340..4d04d6b7570c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,7 +168,7 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-LABEL: localize_internal_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, 1, s0
+; GFX9-NEXT: s_and_b32 s4, 1, s4
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a86a3f6f279d7..a8a6f1954edd1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26343,37 +26343,37 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_false_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_false_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_false_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_false_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_false_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_false_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp false bfloat %a, %b
ret i1 %op
@@ -26387,8 +26387,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oeq_bf16:
@@ -26398,8 +26397,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oeq_bf16:
@@ -26407,8 +26405,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oeq_bf16:
@@ -26416,8 +26413,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oeq_bf16:
@@ -26425,8 +26421,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oeq_bf16:
@@ -26435,8 +26430,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oeq bfloat %a, %b
ret i1 %op
@@ -26450,8 +26444,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ogt_bf16:
@@ -26461,8 +26454,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ogt_bf16:
@@ -26470,8 +26462,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ogt_bf16:
@@ -26479,8 +26470,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ogt_bf16:
@@ -26488,8 +26478,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ogt_bf16:
@@ -26498,8 +26487,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ogt bfloat %a, %b
ret i1 %op
@@ -26513,8 +26501,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oge_bf16:
@@ -26524,8 +26511,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oge_bf16:
@@ -26533,8 +26519,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oge_bf16:
@@ -26542,8 +26527,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oge_bf16:
@@ -26551,8 +26535,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oge_bf16:
@@ -26561,8 +26544,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oge bfloat %a, %b
ret i1 %op
@@ -26576,8 +26558,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_olt_bf16:
@@ -26587,8 +26568,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_olt_bf16:
@@ -26596,8 +26576,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_olt_bf16:
@@ -26605,8 +26584,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_olt_bf16:
@@ -26614,8 +26592,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_olt_bf16:
@@ -26624,8 +26601,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp olt bfloat %a, %b
ret i1 %op
@@ -26639,8 +26615,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ole_bf16:
@@ -26650,8 +26625,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ole_bf16:
@@ -26659,8 +26633,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ole_bf16:
@@ -26668,8 +26641,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ole_bf16:
@@ -26677,8 +26649,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_le_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ole_bf16:
@@ -26687,8 +26658,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_le_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ole bfloat %a, %b
ret i1 %op
@@ -26702,8 +26672,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_one_bf16:
@@ -26713,8 +26682,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_one_bf16:
@@ -26722,8 +26690,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_one_bf16:
@@ -26731,8 +26698,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_one_bf16:
@@ -26740,8 +26706,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_lg_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_one_bf16:
@@ -26750,8 +26715,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_lg_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp one bfloat %a, %b
ret i1 %op
@@ -26765,8 +26729,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uno_bf16:
@@ -26776,8 +26739,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uno_bf16:
@@ -26785,8 +26747,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uno_bf16:
@@ -26794,8 +26755,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uno_bf16:
@@ -26803,8 +26763,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uno_bf16:
@@ -26813,8 +26772,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uno bfloat %a, %b
ret i1 %op
@@ -26828,8 +26786,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ueq_bf16:
@@ -26839,8 +26796,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ueq_bf16:
@@ -26848,8 +26804,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ueq_bf16:
@@ -26857,8 +26812,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ueq_bf16:
@@ -26866,8 +26820,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_nlg_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ueq_bf16:
@@ -26876,8 +26829,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ueq bfloat %a, %b
ret i1 %op
@@ -26891,8 +26843,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ugt_bf16:
@@ -26902,8 +26853,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ugt_bf16:
@@ -26911,8 +26861,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ugt_bf16:
@@ -26920,8 +26869,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ugt_bf16:
@@ -26929,8 +26877,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_nle_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ugt_bf16:
@@ -26939,8 +26886,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nle_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ugt bfloat %a, %b
ret i1 %op
@@ -26954,8 +26900,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uge_bf16:
@@ -26965,8 +26910,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uge_bf16:
@@ -26974,8 +26918,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uge_bf16:
@@ -26983,8 +26926,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uge_bf16:
@@ -26992,8 +26934,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_nlt_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uge_bf16:
@@ -27002,8 +26943,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uge bfloat %a, %b
ret i1 %op
@@ -27017,8 +26957,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ult_bf16:
@@ -27028,8 +26967,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ult_bf16:
@@ -27037,8 +26975,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ult_bf16:
@@ -27046,8 +26983,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ult_bf16:
@@ -27055,8 +26991,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_nge_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ult_bf16:
@@ -27065,8 +27000,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ult bfloat %a, %b
ret i1 %op
@@ -27080,8 +27014,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ule_bf16:
@@ -27091,8 +27024,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ule_bf16:
@@ -27100,8 +27032,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ule_bf16:
@@ -27109,8 +27040,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ule_bf16:
@@ -27118,8 +27048,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ngt_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ule_bf16:
@@ -27128,8 +27057,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ule bfloat %a, %b
ret i1 %op
@@ -27143,8 +27071,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_une_bf16:
@@ -27154,8 +27081,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_une_bf16:
@@ -27163,8 +27089,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_une_bf16:
@@ -27172,8 +27097,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_une_bf16:
@@ -27181,8 +27105,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_neq_f32_e64 s0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_une_bf16:
@@ -27191,8 +27114,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_neq_f32_e64 s0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp une bfloat %a, %b
ret i1 %op
@@ -27202,37 +27124,37 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_true_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_mov_b64 s[0:1], -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_true_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, 1
+; GFX7-NEXT: s_mov_b64 s[0:1], -1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_true_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, 1
+; GFX8-NEXT: s_mov_b64 s[0:1], -1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_true_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_true_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: s_mov_b32 s0, -1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_true_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp true bfloat %a, %b
ret i1 %op
@@ -33554,56 +33476,39 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, bfloat %a, bfloat %b
ret bfloat %op
@@ -33613,60 +33518,46 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_lhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_lhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_lhs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_fneg_lhs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_lhs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_fneg_lhs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg bfloat %a
%op = select i1 %cond, bfloat %neg.a, bfloat %b
@@ -33677,60 +33568,46 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_rhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_rhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_rhs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_fneg_rhs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_rhs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_fneg_rhs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.b = fneg bfloat %b
%op = select i1 %cond, bfloat %a, bfloat %neg.b
@@ -33741,89 +33618,69 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GCN-LABEL: v_select_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v2, s4
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v2, s0
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
@@ -34205,22 +34062,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GCN-LABEL: v_select_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v4, v3, 16
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v2, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -34229,22 +34078,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GFX7-LABEL: v_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v5, v2, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -34253,37 +34094,29 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GFX8-LABEL: v_select_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
ret <3 x bfloat> %op
@@ -34293,26 +34126,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GCN-LABEL: v_select_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34322,26 +34145,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX7-LABEL: v_select_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34351,37 +34164,29 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX8-LABEL: v_select_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
ret <4 x bfloat> %op
@@ -34391,35 +34196,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GCN-LABEL: v_select_v6bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34431,35 +34222,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX7-LABEL: v_select_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34471,41 +34248,33 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX8-LABEL: v_select_v6bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v6bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v6bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v6bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
ret <6 x bfloat> %op
@@ -34515,44 +34284,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GCN-LABEL: v_select_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v3, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v5, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v7, v15, v14, 16
+; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34566,44 +34317,26 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX7-LABEL: v_select_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13
-; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v10, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34617,44 +34350,37 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX8-LABEL: v_select_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
ret <8 x bfloat> %op
@@ -34664,81 +34390,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT: v_alignbit_b32 v1, v1, v16, 16
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23
+; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GCN-NEXT: v_alignbit_b32 v5, v5, v20, 16
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GCN-NEXT: v_alignbit_b32 v7, v7, v22, 16
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16
-; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
+; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GCN-NEXT: v_alignbit_b32 v9, v9, v24, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT: v_alignbit_b32 v11, v16, v26, 16
+; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v13, v17, v28, 16
+; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16
+; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34753,9 +34442,10 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
-; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GCN-NEXT: v_alignbit_b32 v15, v15, v30, 16
+; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v14, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -34763,77 +34453,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-LABEL: v_select_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v16, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v16, v16, v26, 16
+; GFX7-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v17, v28, 16
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v10, s[4:5]
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v18, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v20, 16
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v22, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v9, v24, 16
+; GFX7-NEXT: v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v8, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34844,15 +34501,12 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v10, v10, v30, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v10, v14, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
@@ -34862,58 +34516,53 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX8-LABEL: v_select_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v4, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
ret <16 x bfloat> %op
@@ -34923,220 +34572,152 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GCN-LABEL: v_select_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GCN-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9
+; GCN-NEXT: v_alignbit_b32 v4, v4, v8, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11
+; GCN-NEXT: v_alignbit_b32 v5, v5, v10, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_alignbit_b32 v6, v6, v12, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; GCN-NEXT: v_alignbit_b32 v7, v7, v14, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; GCN-NEXT: v_alignbit_b32 v8, v8, v16, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT: v_alignbit_b32 v9, v9, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21
+; GCN-NEXT: v_alignbit_b32 v10, v10, v20, 16
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GCN-NEXT: v_alignbit_b32 v11, v11, v22, 16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25
+; GCN-NEXT: v_alignbit_b32 v12, v12, v24, 16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v27
+; GCN-NEXT: v_alignbit_b32 v13, v13, v26, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v29
+; GCN-NEXT: v_alignbit_b32 v14, v14, v28, 16
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16
+; GCN-NEXT: v_alignbit_b32 v17, v17, v20, 16
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_alignbit_b32 v19, v19, v21, 16
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:52
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_alignbit_b32 v21, v21, v23, 16
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_alignbit_b32 v23, v23, v25, 16
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v26
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_alignbit_b32 v25, v25, v27, 16
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v28
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_alignbit_b32 v27, v27, v29, 16
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
+; GCN-NEXT: v_alignbit_b32 v29, v29, v32, 16
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_alignbit_b32 v30, v31, v30, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc
+; GCN-NEXT: v_alignbit_b32 v31, v31, v33, 16
+; GCN-NEXT: v_cndmask_b32_e64 v31, v31, v30, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v29, v29, v14, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v28, v28, v13, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v27, v27, v12, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v26, v26, v11, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v25, v25, v10, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v24, v24, v9, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v23, v23, v8, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v22, v22, v7, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v13, v21, v6, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v11, v20, v5, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v19, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v18, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v16, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v15, v0, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -35174,244 +34755,174 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-LABEL: v_select_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
-; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
-; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v12, 16
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v16, 16
+; GFX7-NEXT: v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_alignbit_b32 v20, v21, v20, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_alignbit_b32 v28, v29, v28, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_alignbit_b32 v18, v19, v18, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_alignbit_b32 v22, v23, v22, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v26, v27, v26, 16
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX7-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_alignbit_b32 v10, v10, v11, 16
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v11, v11, v12, 16
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_alignbit_b32 v12, v12, v13, 16
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v13, v14, 16
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v4, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_alignbit_b32 v14, v14, v15, 16
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v5, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v2, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v6, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_alignbit_b32 v16, v16, v17, 16
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, v7, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v12, v3, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v10, v1, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v0, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_alignbit_b32 v17, v17, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v8, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX7-NEXT: v_alignbit_b32 v19, v19, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_cndmask_b32_e64 v19, v19, v18, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX7-NEXT: v_alignbit_b32 v21, v21, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_cndmask_b32_e64 v21, v21, v20, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX7-NEXT: v_alignbit_b32 v23, v23, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v22, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
+; GFX7-NEXT: v_alignbit_b32 v25, v25, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
+; GFX7-NEXT: v_alignbit_b32 v27, v27, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v26, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
+; GFX7-NEXT: v_alignbit_b32 v29, v29, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v28, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
+; GFX7-NEXT: v_alignbit_b32 v30, v31, v30, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
-; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v30, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -35419,103 +34930,93 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX8-LABEL: v_select_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v30, v14, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, v15, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v30, v14, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v16, v15, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v24, v8, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v25, v9, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v26, v10, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v27, v11, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v24, v8, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v25, v9, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v26, v10, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v11, v27, v11, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v12, v28, v12, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v29, v13, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v14, v30, v14, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
+; GFX11-NEXT: v_cndmask_b32_e64 v15, v31, v15, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
ret <32 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 8766303d7ee6e..283cbd6aa61c6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -45,13 +45,13 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i8_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i8_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -107,12 +107,12 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -168,12 +168,12 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -228,13 +228,13 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i64_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -289,13 +289,13 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -350,14 +350,14 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -412,15 +412,15 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s17
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s19, s17
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -475,19 +475,19 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s22, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s17
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
-; GFX9-NEXT: s_mov_b32 s16, s18
-; GFX9-NEXT: s_mov_b32 s17, s19
-; GFX9-NEXT: s_mov_b32 s18, s20
-; GFX9-NEXT: s_mov_b32 s19, s21
+; GFX9-NEXT: s_mov_b32 s23, s21
+; GFX9-NEXT: s_mov_b32 s22, s20
+; GFX9-NEXT: s_mov_b32 s21, s19
+; GFX9-NEXT: s_mov_b32 s20, s18
+; GFX9-NEXT: s_mov_b32 s19, s17
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[22:23]
-; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT: s_getpc_b64 s[24:25]
+; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -553,12 +553,12 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -614,12 +614,12 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_bf16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -675,12 +675,12 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f32_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -735,13 +735,13 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f64_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -797,12 +797,12 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -859,12 +859,12 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2bf16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -919,13 +919,13 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -980,13 +980,13 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v4f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v4f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1041,13 +1041,13 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p0_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p0_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1102,13 +1102,13 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p1_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p1_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1164,12 +1164,12 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p3_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p3_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1224,15 +1224,15 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s17
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s19, s17
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p1_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p1_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1287,13 +1287,13 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2p5_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2p5_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1348,16 +1348,16 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX9-NEXT: v_writelane_b32 v40, s19, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s17
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
-; GFX9-NEXT: s_mov_b32 s16, s18
+; GFX9-NEXT: s_mov_b32 s20, s18
+; GFX9-NEXT: s_mov_b32 s19, s17
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT: s_getpc_b64 s[22:23]
+; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
@@ -1412,23 +1412,24 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, vcc
; GFX9-NEXT: v_writelane_b32 v40, s29, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s17
-; GFX9-NEXT: s_mov_b32 s2, s16
-; GFX9-NEXT: s_mov_b32 s1, s7
-; GFX9-NEXT: s_mov_b32 s0, s6
-; GFX9-NEXT: s_mov_b32 s16, s18
-; GFX9-NEXT: s_mov_b32 s17, s19
-; GFX9-NEXT: s_mov_b32 s18, s20
-; GFX9-NEXT: s_mov_b32 s19, s21
-; GFX9-NEXT: s_mov_b32 s20, s22
-; GFX9-NEXT: s_mov_b32 s21, s23
-; GFX9-NEXT: s_mov_b32 s22, s24
-; GFX9-NEXT: s_mov_b32 s23, s25
-; GFX9-NEXT: s_mov_b32 s24, s26
-; GFX9-NEXT: s_mov_b32 s25, s27
-; GFX9-NEXT: s_mov_b32 s26, s28
+; GFX9-NEXT: s_mov_b32 s30, s28
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_mov_b32 s29, s27
+; GFX9-NEXT: s_mov_b32 s28, s26
+; GFX9-NEXT: s_mov_b32 s27, s25
+; GFX9-NEXT: s_mov_b32 s26, s24
+; GFX9-NEXT: s_mov_b32 s25, s23
+; GFX9-NEXT: s_mov_b32 s24, s22
+; GFX9-NEXT: s_mov_b32 s23, s21
+; GFX9-NEXT: s_mov_b32 s22, s20
+; GFX9-NEXT: s_mov_b32 s21, s19
+; GFX9-NEXT: s_mov_b32 s20, s18
+; GFX9-NEXT: s_mov_b32 s19, s17
+; GFX9-NEXT: s_mov_b32 s18, s16
+; GFX9-NEXT: s_mov_b32 s17, s7
+; GFX9-NEXT: s_mov_b32 s16, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s30
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 vcc
; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
@@ -1513,22 +1514,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
; GFX9-NEXT: v_writelane_b32 v40, s23, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s3, s7
-; GFX9-NEXT: s_mov_b32 s2, s6
-; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s4, s8
-; GFX9-NEXT: s_mov_b32 s5, s9
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: s_mov_b32 s8, s15
-; GFX9-NEXT: s_mov_b32 s9, s16
-; GFX9-NEXT: s_mov_b32 s10, s17
-; GFX9-NEXT: s_mov_b32 s11, s18
-; GFX9-NEXT: s_mov_b32 s15, s19
-; GFX9-NEXT: s_mov_b32 s16, s20
-; GFX9-NEXT: s_mov_b32 s17, s21
-; GFX9-NEXT: s_mov_b32 s18, s22
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[24:25]
; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 725c2d71ac5e3..5882ae7a31c09 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 1
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT: s_getpc_b64 s[6:7]
+; VI-NEXT: s_add_u32 s6, s6, external_void_func_i1@rel32@lo+4
+; VI-NEXT: s_addc_u32 s7, s7, external_void_func_i1@rel32@hi+12
+; VI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; VI-NEXT: s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_imm:
@@ -89,12 +89,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 1
+; CI-NEXT: s_mov_b64 s[4:5], -1
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT: s_getpc_b64 s[6:7]
+; CI-NEXT: s_add_u32 s6, s6, external_void_func_i1@rel32@lo+4
+; CI-NEXT: s_addc_u32 s7, s7, external_void_func_i1@rel32@hi+12
+; CI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_imm:
@@ -107,23 +107,23 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_getpc_b64 s[6:7]
+; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_imm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_imm:
@@ -131,14 +131,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-NEXT: s_add_i32 s4, s4, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_add_u32 s0, s0, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: v_mov_b32_e32 v0, 1
+; HSA-NEXT: s_mov_b64 s[4:5], -1
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT: s_getpc_b64 s[6:7]
+; HSA-NEXT: s_add_u32 s6, s6, external_void_func_i1@rel32@lo+4
+; HSA-NEXT: s_addc_u32 s7, s7, external_void_func_i1@rel32@hi+12
+; HSA-NEXT: s_swappc_b64 s[30:31], s[6:7]
; HSA-NEXT: s_endpgm
call void @external_void_func_i1(i1 true)
ret void
@@ -160,11 +160,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT: s_getpc_b64 s[6:7]
+; VI-NEXT: s_add_u32 s6, s6, external_void_func_i1_signext@rel32@lo+4
+; VI-NEXT: s_addc_u32 s7, s7, external_void_func_i1_signext@rel32@hi+12
+; VI-NEXT: v_and_b32_e32 v0, 1, v0
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; VI-NEXT: s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_signext:
@@ -182,11 +183,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; CI-NEXT: v_bfe_i32 v0, v0, 0, 1
-; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT: s_getpc_b64 s[6:7]
+; CI-NEXT: s_add_u32 s6, s6, external_void_func_i1_signext@rel32@lo+4
+; CI-NEXT: s_addc_u32 s7, s7, external_void_func_i1_signext@rel32@hi+12
+; CI-NEXT: v_and_b32_e32 v0, 1, v0
+; CI-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_signext:
@@ -204,11 +206,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_getpc_b64 s[6:7]
+; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_signext:
@@ -218,11 +221,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_signext:
@@ -237,12 +242,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1
-; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT: s_getpc_b64 s[6:7]
+; HSA-NEXT: s_add_u32 s6, s6, external_void_func_i1_signext@rel32@lo+4
+; HSA-NEXT: s_addc_u32 s7, s7, external_void_func_i1_signext@rel32@hi+12
+; HSA-NEXT: v_and_b32_e32 v0, 1, v0
+; HSA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT: s_swappc_b64 s[30:31], s[6:7]
; HSA-NEXT: s_endpgm
+
%var = load volatile i1, ptr addrspace(1) undef
call void @external_void_func_i1_signext(i1 signext %var)
ret void
@@ -265,11 +272,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; VI-NEXT: s_getpc_b64 s[6:7]
+; VI-NEXT: s_add_u32 s6, s6, external_void_func_i1_zeroext@rel32@lo+4
+; VI-NEXT: s_addc_u32 s7, s7, external_void_func_i1_zeroext@rel32@hi+12
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; VI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; VI-NEXT: s_endpgm
;
; CI-LABEL: test_call_external_void_func_i1_zeroext:
@@ -287,11 +295,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; CI-NEXT: s_getpc_b64 s[6:7]
+; CI-NEXT: s_add_u32 s6, s6, external_void_func_i1_zeroext@rel32@lo+4
+; CI-NEXT: s_addc_u32 s7, s7, external_void_func_i1_zeroext@rel32@hi+12
; CI-NEXT: v_and_b32_e32 v0, 1, v0
-; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; CI-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
@@ -309,11 +318,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; GFX9-NEXT: s_getpc_b64 s[6:7]
+; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i1_zeroext@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i1_zeroext@rel32@hi+12
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
@@ -323,11 +333,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i1_zeroext:
@@ -342,11 +354,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; HSA-NEXT: s_getpc_b64 s[6:7]
+; HSA-NEXT: s_add_u32 s6, s6, external_void_func_i1_zeroext@rel32@lo+4
+; HSA-NEXT: s_addc_u32 s7, s7, external_void_func_i1_zeroext@rel32@hi+12
; HSA-NEXT: v_and_b32_e32 v0, 1, v0
-; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; HSA-NEXT: s_swappc_b64 s[30:31], s[6:7]
; HSA-NEXT: s_endpgm
%var = load volatile i1, ptr addrspace(1) undef
call void @external_void_func_i1_zeroext(i1 zeroext %var)
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf9..66a04ed26ddb7 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -11,8 +11,7 @@ define i1 @test1(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, 1000
%cmp2 = icmp slt i32 %arg2, 1000
@@ -25,8 +24,7 @@ define i1 @test2(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, 1000
%cmp2 = icmp ult i32 %arg2, 1000
@@ -39,8 +37,7 @@ define i1 @test3(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e9, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sle i32 %arg1, 1000
%cmp2 = icmp sle i32 %arg2, 1000
@@ -53,8 +50,7 @@ define i1 @test4(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e9, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ule i32 %arg1, 1000
%cmp2 = icmp ule i32 %arg2, 1000
@@ -67,8 +63,7 @@ define i1 @test5(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, 1000
%cmp2 = icmp sgt i32 %arg2, 1000
@@ -81,8 +76,7 @@ define i1 @test6(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg1, 1000
%cmp2 = icmp ugt i32 %arg2, 1000
@@ -95,8 +89,7 @@ define i1 @test7(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, 0x3e7, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sge i32 %arg1, 1000
%cmp2 = icmp sge i32 %arg2, 1000
@@ -109,8 +102,7 @@ define i1 @test8(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, 0x3e7, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp uge i32 %arg1, 1000
%cmp2 = icmp uge i32 %arg2, 1000
@@ -123,8 +115,7 @@ define i1 @test9(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, %arg3
%cmp2 = icmp slt i32 %arg2, %arg3
@@ -137,8 +128,7 @@ define i1 @test10(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %arg3
%cmp2 = icmp ult i32 %arg2, %arg3
@@ -151,8 +141,7 @@ define i1 @test11(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sle i32 %arg1, %arg3
%cmp2 = icmp sle i32 %arg2, %arg3
@@ -165,8 +154,7 @@ define i1 @test12(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ule i32 %arg1, %arg3
%cmp2 = icmp ule i32 %arg2, %arg3
@@ -179,8 +167,7 @@ define i1 @test13(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, %arg3
%cmp2 = icmp sgt i32 %arg2, %arg3
@@ -193,8 +180,7 @@ define i1 @test14(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg1, %arg3
%cmp2 = icmp ugt i32 %arg2, %arg3
@@ -207,8 +193,7 @@ define i1 @test15(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sge i32 %arg1, %arg3
%cmp2 = icmp sge i32 %arg2, %arg3
@@ -221,8 +206,7 @@ define i1 @test16(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp uge i32 %arg1, %arg3
%cmp2 = icmp uge i32 %arg2, %arg3
@@ -235,8 +219,7 @@ define i1 @test17(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, 1000
%cmp2 = icmp slt i32 %arg2, 1000
@@ -249,8 +232,7 @@ define i1 @test18(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, 1000
%cmp2 = icmp ult i32 %arg2, 1000
@@ -263,8 +245,7 @@ define i1 @test19(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e9, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sle i32 %arg1, 1000
%cmp2 = icmp sle i32 %arg2, 1000
@@ -277,8 +258,7 @@ define i1 @test20(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e9, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ule i32 %arg1, 1000
%cmp2 = icmp ule i32 %arg2, 1000
@@ -291,8 +271,7 @@ define i1 @test21(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, 1000
%cmp2 = icmp sgt i32 %arg2, 1000
@@ -305,8 +284,7 @@ define i1 @test22(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg1, 1000
%cmp2 = icmp ugt i32 %arg2, 1000
@@ -319,8 +297,7 @@ define i1 @test23(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, 0x3e7, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sge i32 %arg1, 1000
%cmp2 = icmp sge i32 %arg2, 1000
@@ -333,8 +310,7 @@ define i1 @test24(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, 0x3e7, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp uge i32 %arg1, 1000
%cmp2 = icmp uge i32 %arg2, 1000
@@ -347,8 +323,7 @@ define i1 @test25(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, %arg3
%cmp2 = icmp slt i32 %arg2, %arg3
@@ -361,8 +336,7 @@ define i1 @test26(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %arg3
%cmp2 = icmp ult i32 %arg2, %arg3
@@ -375,8 +349,7 @@ define i1 @test27(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_le_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sle i32 %arg1, %arg3
%cmp2 = icmp sle i32 %arg2, %arg3
@@ -389,8 +362,7 @@ define i1 @test28(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_le_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ule i32 %arg1, %arg3
%cmp2 = icmp ule i32 %arg2, %arg3
@@ -403,8 +375,7 @@ define i1 @test29(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, %arg3
%cmp2 = icmp sgt i32 %arg2, %arg3
@@ -417,8 +388,7 @@ define i1 @test30(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg1, %arg3
%cmp2 = icmp ugt i32 %arg2, %arg3
@@ -431,8 +401,7 @@ define i1 @test31(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sge i32 %arg1, %arg3
%cmp2 = icmp sge i32 %arg2, %arg3
@@ -445,8 +414,7 @@ define i1 @test32(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp uge i32 %arg1, %arg3
%cmp2 = icmp uge i32 %arg2, %arg3
@@ -459,8 +427,7 @@ define i1 @test33(i32 %arg1, i32 %arg2) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v1, 0x3e8, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, %arg2
%cmp2 = icmp slt i32 %arg1, 1000
@@ -633,8 +600,7 @@ define i1 @test42(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg3, %arg1
%cmp2 = icmp ult i32 %arg3, %arg2
@@ -647,8 +613,7 @@ define i1 @test43(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg3, %arg1
%cmp2 = icmp ult i32 %arg3, %arg2
@@ -661,8 +626,7 @@ define i1 @test44(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg3, %arg1
%cmp2 = icmp ugt i32 %arg3, %arg2
@@ -675,8 +639,7 @@ define i1 @test45(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ugt i32 %arg3, %arg1
%cmp2 = icmp ugt i32 %arg3, %arg2
@@ -689,8 +652,7 @@ define i1 @test46(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg3, %arg1
%cmp2 = icmp sgt i32 %arg2, %arg3
@@ -703,8 +665,7 @@ define i1 @test47(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, %arg3
%cmp2 = icmp slt i32 %arg3, %arg2
@@ -717,8 +678,7 @@ define i1 @test48(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, %arg3
%cmp2 = icmp sgt i32 %arg3, %arg2
@@ -731,8 +691,7 @@ define i1 @test49(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg3, %arg1
%cmp2 = icmp slt i32 %arg2, %arg3
@@ -745,8 +704,7 @@ define i1 @test50(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg3, %arg1
%cmp2 = icmp sgt i32 %arg2, %arg3
@@ -759,8 +717,7 @@ define i1 @test51(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg1, %arg3
%cmp2 = icmp slt i32 %arg3, %arg2
@@ -773,8 +730,7 @@ define i1 @test52(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, %arg3
%cmp2 = icmp sgt i32 %arg3, %arg2
@@ -787,8 +743,7 @@ define i1 @test53(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_i32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp sgt i32 %arg3, %arg1
%cmp2 = icmp slt i32 %arg2, %arg3
@@ -801,8 +756,7 @@ define i1 @test54(float %arg1, float %arg2, float %arg3) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %arg3
%cmp2 = fcmp olt float %arg2, %arg3
@@ -815,8 +769,7 @@ define i1 @test55(double %arg1, double %arg2, double %arg3) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ole double %arg1, %arg3
%cmp2 = fcmp ole double %arg2, %arg3
@@ -829,8 +782,7 @@ define i1 @test56(double %arg1, double %arg2, double %arg3) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ogt double %arg1, %arg3
%cmp2 = fcmp ogt double %arg2, %arg3
@@ -843,8 +795,7 @@ define i1 @test57(float %arg1, float %arg2, float %arg3) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp oge float %arg1, %arg3
%cmp2 = fcmp oge float %arg2, %arg3
@@ -857,16 +808,14 @@ define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test58:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ugt double %arg1, %arg3
%cmp2 = fcmp ugt double %arg2, %arg3
@@ -879,16 +828,14 @@ define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test59:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp uge float %arg1, %arg3
%cmp2 = fcmp uge float %arg2, %arg3
@@ -901,16 +848,14 @@ define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test60:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ule float %arg1, %arg3
%cmp2 = fcmp ule float %arg2, %arg3
@@ -923,16 +868,14 @@ define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test61:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult double %arg1, %arg3
%cmp2 = fcmp ult double %arg2, %arg3
@@ -946,8 +889,7 @@ define i1 @test62(float %arg1, float %arg2, float %arg3) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, 1.0
%add2 = fadd nnan float %arg2, 2.0
@@ -964,8 +906,7 @@ define i1 @test63(double %arg1, double %arg2, double %arg3) #0 {
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan double %arg1, 1.0
%add2 = fadd nnan double %arg2, 2.0
@@ -982,8 +923,7 @@ define i1 @test64(double %arg1, double %arg2, double %arg3) #0 {
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan double %arg1, 1.0
%add2 = fadd nnan double %arg2, 2.0
@@ -999,8 +939,7 @@ define i1 @test65(float %arg1, float %arg2, float %arg3) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, 1.0
%add2 = fadd nnan float %arg2, 2.0
@@ -1017,8 +956,7 @@ define i1 @test66(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan double %arg1, 1.0
%add2 = fadd nnan double %arg2, 2.0
@@ -1034,8 +972,7 @@ define i1 @test67(float %arg1, float %arg2, float %arg3) #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, 1.0
%add2 = fadd nnan float %arg2, 2.0
@@ -1051,8 +988,7 @@ define i1 @test68(float %arg1, float %arg2, float %arg3) #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, 1.0
%add2 = fadd nnan float %arg2, 2.0
@@ -1069,8 +1005,7 @@ define i1 @test69(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan double %arg1, 1.0
%add2 = fadd nnan double %arg2, 2.0
@@ -1086,16 +1021,14 @@ define i1 @test70(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test70:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1112,8 +1045,7 @@ define i1 @test71(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GCN-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1130,8 +1062,7 @@ define i1 @test72(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GCN-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1147,16 +1078,14 @@ define i1 @test73(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test73:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1173,8 +1102,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[0:1], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test74:
@@ -1183,8 +1111,7 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1200,16 +1127,14 @@ define i1 @test75(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test75:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1225,16 +1150,14 @@ define i1 @test76(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test76:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1251,8 +1174,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[0:1], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test77:
@@ -1261,8 +1183,7 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1277,8 +1198,7 @@ define i1 @test78(float %arg1, float %arg2, float %arg3) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %arg3
%cmp2 = fcmp ogt float %arg3, %arg2
@@ -1291,16 +1211,14 @@ define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test79:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult float %arg1, %arg3
%cmp2 = fcmp ugt float %arg3, %arg2
@@ -1314,8 +1232,7 @@ define i1 @test80(float %arg1, float %arg2, float %arg3) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, 1.0
%add2 = fadd nnan float %arg2, 2.0
@@ -1332,8 +1249,7 @@ define i1 @test81(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan double %arg1, 1.0
%add2 = fadd nnan double %arg2, 2.0
@@ -1350,8 +1266,7 @@ define i1 @test82(double %arg1, double %arg2, double %arg3) {
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GCN-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1367,16 +1282,14 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test83:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1393,16 +1306,14 @@ define i1 @test84(half %arg1, half %arg2, half %arg3) {
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f16_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test84:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f16_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call half @llvm.canonicalize.f16(half %arg1)
%var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1487,16 +1398,14 @@ define i1 @test87(half %arg1, half %arg2, half %arg3) {
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test87:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f16_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call half @llvm.canonicalize.f16(half %arg1)
%var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1547,16 +1456,14 @@ define i1 @test89(half %arg1, half %arg2, half %arg3) {
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nlt_f16_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test89:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f16_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call half @llvm.canonicalize.f16(half %arg1)
%var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1573,16 +1480,14 @@ define i1 @test90(half %arg1, half %arg2, half %arg3) {
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ngt_f16_e64 s0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test90:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f16_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call half @llvm.canonicalize.f16(half %arg1)
%var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1631,8 +1536,7 @@ define i1 @test92(i32 %arg1, i32 %arg2, i32 %arg3, i32 %C) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v3
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1651,7 +1555,6 @@ define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
; GCN-NEXT: v_cmp_gt_u32_e64 s0, v1, v4
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1671,8 +1574,7 @@ define i1 @test94(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %ar
; GCN-NEXT: v_min3_u32 v0, v0, v1, v2
; GCN-NEXT: v_min_u32_e32 v0, v0, v4
; GCN-NEXT: v_min3_u32 v0, v5, v6, v0
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1697,8 +1599,7 @@ define i1 @test95(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1713,8 +1614,7 @@ define i1 @test96(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1730,8 +1630,7 @@ define i1 @test97(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v0, v0, v1
; GCN-NEXT: v_max3_u32 v0, v0, v2, v3
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1749,8 +1648,7 @@ define i1 @test98(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_u32_e32 v2, v2, v3
; GCN-NEXT: v_minmax_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1768,8 +1666,7 @@ define i1 @test99(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v2, v2, v3
; GCN-NEXT: v_min3_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1787,8 +1684,7 @@ define i1 @test100(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_max_u32_e32 v2, v2, v3
; GCN-NEXT: v_maxmin_u32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1807,8 +1703,7 @@ define i1 @test101(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
; GCN-NEXT: v_minmax_u32 v1, v3, v4, v5
; GCN-NEXT: v_min3_u32 v0, v0, v2, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v6
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1831,8 +1726,7 @@ define i1 @test102(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
; GCN-NEXT: v_min_u32_e32 v1, v2, v3
; GCN-NEXT: v_min3_u32 v0, v0, v5, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v6
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v6
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1859,7 +1753,6 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v2, v6
; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v6
; GCN-NEXT: s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1892,7 +1785,6 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_or_b32 s1, s2, vcc_lo
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1931,7 +1823,6 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: s_and_b32 s0, vcc_lo, s0
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: s_and_b32 s0, s0, s1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C
%cmp2 = icmp ult i32 %arg2, %C
@@ -1968,7 +1859,6 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_or_b32 s0, s2, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %C1
%cmp2 = icmp ult i32 %arg2, %C1
@@ -2001,8 +1891,7 @@ define i1 @test107(float %arg1, float %arg2, float %arg3, float %C) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2017,16 +1906,14 @@ define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test108:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v3
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult float %arg1, %C
%cmp2 = fcmp ult float %arg2, %C
@@ -2046,7 +1933,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test109:
@@ -2056,7 +1942,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4
; GFX11NONANS-NEXT: v_cmp_gt_f32_e64 s0, v1, v4
; GFX11NONANS-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2078,7 +1963,6 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
; GCN-NEXT: v_cmp_gt_f32_e64 s0, v1, v8
; GCN-NEXT: s_and_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, %C1
%add2 = fadd nnan float %arg2, %C2
@@ -2099,12 +1983,12 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT: v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX11-NEXT: v_max_f32_e32 v1, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min3_f32 v0, v5, v6, v0
-; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test111:
@@ -2114,8 +1998,7 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11NONANS-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v4
; GFX11NONANS-NEXT: v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2141,13 +2024,13 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v4, v8
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
; GFX11-NEXT: v_max_f32_e32 v3, v6, v6
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT: v_min3_f32 v0, v0, v5, v3
+; GFX11-NEXT: v_max_f32_e32 v1, v5, v5
+; GFX11-NEXT: v_min3_f32 v0, v0, v1, v3
; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test112:
@@ -2157,8 +2040,7 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11NONANS-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v4
; GFX11NONANS-NEXT: v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2187,15 +2069,13 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v0, v3
; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test113:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v3
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult float %arg1, %C
%cmp2 = fcmp ult float %arg2, %C
@@ -2214,7 +2094,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v0, v3
; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test114:
@@ -2224,7 +2103,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
; GFX11NONANS-NEXT: v_cmp_gt_f32_e64 s0, v0, v3
; GFX11NONANS-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ogt float %arg1, %C
%cmp2 = fcmp ogt float %arg2, %C
@@ -2244,7 +2122,6 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4
; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v1, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test115:
@@ -2252,8 +2129,7 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v2, v2, v3
; GFX11NONANS-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v4
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2287,7 +2163,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s1, s2, vcc_lo
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test116:
@@ -2304,7 +2179,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1
; GFX11NONANS-NEXT: s_or_b32 s1, s2, vcc_lo
; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C
%cmp2 = fcmp olt float %arg2, %C
@@ -2348,7 +2222,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s0, s2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test117:
@@ -2366,7 +2239,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
; GFX11NONANS-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1
; GFX11NONANS-NEXT: s_or_b32 s0, s2, s0
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %C1
%cmp2 = fcmp olt float %arg2, %C1
@@ -2403,8 +2275,7 @@ define i1 @test118(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
; GCN-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_max3_f32 v0, v0, v2, v3
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, %C1
%add2 = fadd nnan float %arg2, %C2
@@ -2428,8 +2299,7 @@ define i1 @test119(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
; GCN-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
; GCN-NEXT: v_min_f32_e32 v2, v2, v3
; GCN-NEXT: v_minmax_f32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, %C1
%add2 = fadd nnan float %arg2, %C2
@@ -2453,8 +2323,7 @@ define i1 @test120(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
; GCN-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v2, v2, v3
; GCN-NEXT: v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, %C1
%add2 = fadd nnan float %arg2, %C2
@@ -2478,8 +2347,7 @@ define i1 @test121(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
; GCN-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v2, v2, v3
; GCN-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
%add1 = fadd nnan float %arg1, %C1
%add2 = fadd nnan float %arg2, %C2
@@ -2500,8 +2368,7 @@ define i1 @test122(double %arg1, double %arg2, double %arg3) #1 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult double %arg1, %arg3
%cmp2 = fcmp ult double %arg2, %arg3
@@ -2516,8 +2383,7 @@ define i1 @test123(double %arg1, double %arg2, double %arg3) #1 {
; GCN-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GCN-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GCN-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2536,7 +2402,6 @@ define i1 @test124(i32 %arg1, i64 %arg2) {
; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2]
; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e8, v0
; GCN-NEXT: s_or_b32 s0, s0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp slt i32 %arg1, 1000
%cmp2 = icmp slt i64 %arg2, 1000
@@ -2551,7 +2416,6 @@ define i1 @test125(i32 %arg1, i32 %arg2) {
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v0
; GCN-NEXT: v_cmp_eq_u32_e64 s0, 0x3e8, v1
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp eq i32 %arg1, 1000
%cmp2 = icmp eq i32 %arg2, 1000
@@ -2566,7 +2430,6 @@ define i1 @test126(i32 %arg1, i32 %arg2) {
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v0
; GCN-NEXT: v_cmp_ne_u32_e64 s0, 0x3e8, v1
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ne i32 %arg1, 1000
%cmp2 = icmp ne i32 %arg2, 1000
@@ -2581,7 +2444,6 @@ define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) {
; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
; GCN-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[4:5]
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i64 %arg1, %arg3
%cmp2 = icmp ult i64 %arg2, %arg3
@@ -2596,7 +2458,6 @@ define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
; GCN-NEXT: v_cmp_lt_u32_e64 s0, v2, v1
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %arg3
%cmp2 = icmp ult i32 %arg3, %arg2
@@ -2611,7 +2472,6 @@ define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2
; GCN-NEXT: v_cmp_le_u32_e64 s0, v1, v2
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %arg3
%cmp2 = icmp ule i32 %arg2, %arg3
@@ -2626,7 +2486,6 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
; GCN-NEXT: v_cmp_le_u32_e32 vcc_lo, v2, v0
; GCN-NEXT: v_cmp_gt_u32_e64 s0, v1, v2
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ule i32 %arg3, %arg1
%cmp2 = icmp ugt i32 %arg2, %arg3
@@ -2641,7 +2500,6 @@ define i1 @test131(i16 %arg1, i32 %arg2) {
; GCN-NEXT: v_cmp_gt_u16_e32 vcc_lo, 10, v0
; GCN-NEXT: v_cmp_gt_u32_e64 s0, 10, v1
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i16 %arg1, 10
%cmp2 = icmp ult i32 %arg2, 10
@@ -2659,7 +2517,6 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
; GCN-NEXT: s_or_b32 s1, s1, vcc_lo
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, %arg3
%cmp2 = icmp ult i32 %arg2, %arg3
@@ -2677,7 +2534,6 @@ define i1 @test133(i32 %arg1, i32 %arg2) {
; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v0
; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e8, v1
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%cmp1 = icmp ult i32 %arg1, 100
%cmp2 = icmp ult i32 %arg2, 1000
@@ -2692,15 +2548,13 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v2, v1
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test134:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %arg3
%cmp2 = fcmp ogt float %arg3, %arg2
@@ -2715,15 +2569,13 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_nle_f32_e64 s0, v2, v1
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test135:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult float %arg1, %arg3
%cmp2 = fcmp ugt float %arg3, %arg2
@@ -2740,7 +2592,6 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_ge_f64_e64 s0, v[4:5], v[2:3]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test136:
@@ -2749,8 +2600,7 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2768,15 +2618,13 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v2, v1
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test137:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -2793,15 +2641,13 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test138:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp olt float %arg1, %arg3
%cmp2 = fcmp olt float %arg2, %arg3
@@ -2816,15 +2662,13 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test139:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ole double %arg1, %arg3
%cmp2 = fcmp ole double %arg2, %arg3
@@ -2839,15 +2683,13 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test140:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ogt double %arg1, %arg3
%cmp2 = fcmp ogt double %arg2, %arg3
@@ -2862,15 +2704,13 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v1, v2
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test141:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp oge float %arg1, %arg3
%cmp2 = fcmp oge float %arg2, %arg3
@@ -2885,15 +2725,13 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test142:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ugt double %arg1, %arg3
%cmp2 = fcmp ugt double %arg2, %arg3
@@ -2908,15 +2746,13 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test143:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp uge float %arg1, %arg3
%cmp2 = fcmp uge float %arg2, %arg3
@@ -2931,15 +2767,13 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test144:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ule float %arg1, %arg3
%cmp2 = fcmp ule float %arg2, %arg3
@@ -2954,15 +2788,13 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test145:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%cmp1 = fcmp ult double %arg1, %arg3
%cmp2 = fcmp ult double %arg2, %arg3
@@ -2978,15 +2810,13 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test146:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3005,7 +2835,6 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test147:
@@ -3014,8 +2843,7 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3034,7 +2862,6 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test148:
@@ -3043,8 +2870,7 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3062,15 +2888,13 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v1, v2
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test149:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3089,7 +2913,6 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test150:
@@ -3098,8 +2921,7 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_gt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3117,15 +2939,13 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test151:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_ge_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3143,15 +2963,13 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) {
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2
; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test152:
; GFX11NONANS: ; %bb.0:
; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_le_f32_e64 s0, v0, v2
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call float @llvm.canonicalize.f32(float %arg1)
%var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3170,7 +2988,6 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11NONANS-LABEL: test153:
@@ -3179,8 +2996,7 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
; GFX11NONANS-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11NONANS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11NONANS-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT: v_cmp_lt_f64_e64 s0, v[0:1], v[4:5]
; GFX11NONANS-NEXT: s_setpc_b64 s[30:31]
%var1 = call double @llvm.canonicalize.f64(double %arg1)
%var2 = call double @llvm.canonicalize.f64(double %arg2)
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
index eecc91239c728..279819165f33c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
@@ -5,20 +5,19 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
; CHECK-LABEL: wombat:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: buffer_load_ubyte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %then
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: .LBB0_2: ; %end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
+
entry:
%load = load <1 x i8>, ptr addrspace(5) %addr, align 1
br i1 %cond, label %then, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index c3a6cd5975a77..53448df79ee27 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -34,19 +34,17 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
; GCN-LABEL: name: divergent_trunc_i16_to_i1
; GCN: bb.0 (%ir-block.0):
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3
+ ; GCN-NEXT: liveins: $vgpr2, $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
- ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
; GCN-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY1]], 0, 16, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
- ; GCN-NEXT: SI_RETURN implicit $vgpr0
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%setcc = icmp slt i16 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
@@ -86,18 +84,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
; GCN-LABEL: name: divergent_trunc_i32_to_i1
; GCN: bb.0 (%ir-block.0):
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3
+ ; GCN-NEXT: liveins: $vgpr2, $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
- ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
- ; GCN-NEXT: SI_RETURN implicit $vgpr0
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%setcc = icmp slt i32 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
@@ -141,21 +137,19 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
; GCN-LABEL: name: divergent_trunc_i64_to_i1
; GCN: bb.0 (%ir-block.0):
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4
+ ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
- ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
- ; GCN-NEXT: SI_RETURN implicit $vgpr0
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%setcc = icmp slt i64 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 72ee660dc2adb..02a3066822e51 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -29,6 +29,8 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
; CHECK-NEXT: buffer_store_byte v2, off, s[0:3], s32 offset:1
; CHECK-NEXT: buffer_load_ubyte v0, v1, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = load <8 x i1>, ptr %ptr
%ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index b5440b9c38c9f..fdf060ce5c24e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2835,10 +2835,8 @@ define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
; GCN-LABEL: v_fneg_select_infloop_regression_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
@@ -2850,10 +2848,8 @@ define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1
; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
@@ -2865,10 +2861,8 @@ define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1
; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
@@ -2880,10 +2874,8 @@ define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1
; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
@@ -2896,10 +2888,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %ar
; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
@@ -2911,10 +2901,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %ar
; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
@@ -2926,10 +2914,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %ar
; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
@@ -2941,10 +2927,8 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %ar
; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
@@ -2957,10 +2941,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1
; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
@@ -2972,10 +2954,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float
; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
@@ -2987,10 +2967,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float
; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
@@ -3002,10 +2980,8 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
@@ -3064,12 +3040,10 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
; GCN-LABEL: v_fneg_select_infloop_regression_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_bfrev_b32_e32 v3, 1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v2, 1
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, double 0.0, double %arg
%i2 = fneg double %i
@@ -3121,21 +3095,17 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_and_b32_e32 v1, 1, v1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, half 0.0, half %arg
%i2 = fneg half %i
@@ -3188,11 +3158,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v1, 1, v2
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc
+; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
@@ -3201,11 +3169,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
; VI-LABEL: v_fneg_select_infloop_regression_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
%i2 = fneg <2 x half> %i
@@ -3262,13 +3228,11 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
; GCN-LABEL: v_fneg_select_infloop_regression_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_bfrev_b32_e32 v3, 1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v2, 1
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
%i2 = fneg <2 x float> %i
@@ -3315,10 +3279,8 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
; GCN-LABEL: v_fabs_select_infloop_regression_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
@@ -3366,10 +3328,8 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
; GCN-LABEL: v_fneg_fabs_select_infloop_regression:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index cd1ec85eb8d0f..3680c416cd43f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -7,18 +7,13 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
; GCN-LABEL: fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, -v0, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v1, -v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 %arg0, i32 %arg1
%fneg = xor i32 %select, -2147483648
@@ -57,10 +52,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
; GFX7-LABEL: fneg_xor_select_i32_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: flat_store_dword v[3:4], v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -68,10 +61,8 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
; GFX9-LABEL: fneg_xor_select_i32_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: global_store_dword v[3:4], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -79,12 +70,10 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
; GFX11-LABEL: fneg_xor_select_i32_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
-; GFX11-NEXT: global_store_b32 v[3:4], v1, off
+; GFX11-NEXT: global_store_b32 v[2:3], v1, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i32 %arg0, i32 %arg1
store i32 %select, ptr addrspace(1) %ptr
@@ -96,20 +85,15 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
; GCN-LABEL: fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, -v1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i64 %arg0, i64 %arg1
%fneg = xor i64 %select, 9223372036854775808
@@ -152,19 +136,15 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
; GCN-LABEL: fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 %arg0, i16 %arg1
@@ -231,10 +211,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
; GFX7-LABEL: fneg_xor_select_i16_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: flat_store_short v[3:4], v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX7-NEXT: flat_store_short v[2:3], v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -242,10 +220,8 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
; GFX9-LABEL: fneg_xor_select_i16_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: global_store_short v[3:4], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -253,12 +229,10 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
; GFX11-LABEL: fneg_xor_select_i16_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1
-; GFX11-NEXT: global_store_b16 v[3:4], v1, off
+; GFX11-NEXT: global_store_b16 v[2:3], v1, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i16 %arg0, i16 %arg1
store i16 %select, ptr addrspace(1) %ptr
@@ -270,38 +244,34 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
; GFX7-LABEL: fneg_xor_select_i64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT: flat_store_dwordx2 v[5:6], v[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fneg_xor_select_i64_multi_user:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT: global_store_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: v_cndmask_b32_e64 v1, -v3, -v6, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_i64_multi_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
-; GFX11-NEXT: global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v6, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, -v6, s0
+; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i64 %arg0, i64 %arg1
@@ -314,30 +284,21 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
; GCN-LABEL: select_fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i32 %arg0, -2147483648
%select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -350,25 +311,16 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
; GCN-LABEL: select_fneg_select_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_select_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = fneg float %arg0
%select0 = select i1 %cond0, float %arg1, float %fneg0
@@ -381,20 +333,15 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
; GCN-LABEL: fneg_xor_select_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, -v1, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, -v1, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, double %arg0, double %arg1
%fneg = fneg double %select
@@ -405,12 +352,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
; GFX7-LABEL: fneg_xor_select_f64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX7-NEXT: flat_store_dwordx2 v[5:6], v[0:1]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -418,12 +362,9 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
; GFX9-LABEL: fneg_xor_select_f64_multi_user:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
-; GFX9-NEXT: global_store_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -431,13 +372,11 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
; GFX11-LABEL: fneg_xor_select_f64_multi_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT: global_store_b64 v[5:6], v[0:1], off
+; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, double %arg0, double %arg1
@@ -450,21 +389,18 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_add_f64 v[0:1], -v[1:2], 2.0
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_xor_select_i64_user_with_srcmods:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT: v_add_f64 v[0:1], -v[1:2], 2.0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%select = select i1 %cond, i64 %arg0, i64 %arg1
%fneg = xor i64 %select, 9223372036854775808
@@ -477,32 +413,23 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
; GCN-LABEL: select_fneg_select_fneg_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_select_fneg_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = fneg double %arg0
%select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -515,32 +442,23 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
; GCN-LABEL: select_fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i64 %arg0, 9223372036854775808
%select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -553,45 +471,32 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
; GFX7-LABEL: select_fneg_select_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e64 v2, -v2
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, s[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: select_fneg_select_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_select_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = fneg half %arg0
%select0 = select i1 %cond0, half %arg1, half %fneg0
@@ -604,30 +509,21 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
; GCN-LABEL: select_fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i16 %arg0, -32768
%select0 = select i1 %cond0, i16 %arg1, i16 %fneg0
>From ad7d65712f860ddee8413fa75b11f820b02681d3 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 19 Feb 2024 18:20:19 -0600
Subject: [PATCH 10/20] Updated calling conv such that inreg i1 is promoted to
i32 before being allocated.
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 9 +-
.../CodeGen/AMDGPU/function-args-inreg.ll | 3 -
llvm/test/CodeGen/AMDGPU/function-args.ll | 192 +++++++++---------
3 files changed, 103 insertions(+), 101 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 0a197e4a786cc..8dd1daa642f9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,16 +187,17 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfType<[i1], CCIfInReg<CCPromoteToType<i32>>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
-
- CCIfType<[i1], CCPromoteToType<i32>>,
-
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
+ CCIfType<[i1], CCCustom<"CC_AMDGPU_Custom_I1">>,
+
+ CCIfType<[i1], CCPromoteToType<i32>>,
+
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 44a9127b4bd09..9871b89431cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1793,9 +1793,6 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s7, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s2, s6
-; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 9fca84ef2667c..530e439ae572a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2778,11 +2778,16 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; CI-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2791,13 +2796,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
-; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v18, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2824,8 +2825,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
-; VI-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2834,7 +2836,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2842,7 +2844,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2863,8 +2865,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2873,7 +2876,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -2881,22 +2884,23 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:16
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_cndmask_b32_e64 v32, 0, 1, s0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -4892,51 +4896,51 @@ define void @many_i1_args(
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_xor_saveexec_b64 vcc, -1
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, vcc
-; GFX9-NEXT: v_writelane_b32 v19, s30, 0
-; GFX9-NEXT: v_writelane_b32 v19, s31, 1
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GFX9-NEXT: v_writelane_b32 v20, s30, 0
+; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; GFX9-NEXT: s_mov_b32 s31, 0xf000
; GFX9-NEXT: s_mov_b32 s30, -1
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[8:9]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[10:11]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[10:11]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[12:13]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[12:13]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[14:15]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[14:15]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[16:17]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[16:17]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[18:19]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[18:19]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[20:21]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[22:23]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[22:23]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[24:25]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[24:25]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[26:27]
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[26:27]
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[28:29]
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[28:29]
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: buffer_store_byte v20, off, s[28:31], 0
+; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -4994,10 +4998,10 @@ define void @many_i1_args(
; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v19, 1
-; GFX9-NEXT: v_readlane_b32 s30, v19, 0
+; GFX9-NEXT: v_readlane_b32 s31, v20, 1
+; GFX9-NEXT: v_readlane_b32 s30, v20, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5006,23 +5010,25 @@ define void @many_i1_args(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_xor_saveexec_b32 vcc_lo, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v7, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo
-; GFX11-NEXT: v_writelane_b32 v2, s30, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT: v_writelane_b32 v7, s30, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
; GFX11-NEXT: s_mov_b32 s30, -1
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT: v_writelane_b32 v7, s31, 1
; GFX11-NEXT: s_mov_b32 s31, 0x31016000
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5031,13 +5037,13 @@ define void @many_i1_args(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s11
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s9
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s11
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5046,13 +5052,13 @@ define void @many_i1_args(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s13
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s16
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s13
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s15
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s16
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5061,13 +5067,13 @@ define void @many_i1_args(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s17
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s18
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s19
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s20
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s21
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s17
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s18
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s19
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s20
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s21
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5076,13 +5082,13 @@ define void @many_i1_args(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s22
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s24
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s25
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s26
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s22
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s24
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s25
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s26
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
@@ -5091,27 +5097,25 @@ define void @many_i1_args(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v7, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s27
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s28
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s29
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s27
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s28
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s29
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v0, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v1, off, s[28:31], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_readlane_b32 s31, v2, 1
-; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: v_readlane_b32 s31, v7, 1
+; GFX11-NEXT: v_readlane_b32 s30, v7, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: scratch_load_b32 v7, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
>From d841a49e6b9d06c9b91bdae02e42e6561329facb Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 10 Mar 2024 19:07:38 -0500
Subject: [PATCH 11/20] Add an additional CopyToReg and CopyFromReg for the
CopyFromReg for the i1 return value.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 33 +++++++++++++++++++++-
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 8 ------
2 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0fdb3c4e36c67..a40fec97d6b91 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3239,6 +3239,21 @@ SDValue SITargetLowering::LowerCallResult(
Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
Chain = Val.getValue(1);
InGlue = Val.getValue(2);
+
+ // For i1 return value allocated to an SGPR, we want the dst reg for the
+ // above CopyFromReg not to be of VReg_1 when emitting machine code.
+ // This requires creating an additional CopyToReg followed by another
+ // CopyFromReg.
+ if (RVLocs.size() == 1 && VA.getLocVT() == MVT::i1) {
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
+ if (TRI->isSGPRReg(MRI, VA.getLocReg())) {
+ Register TmpVReg = MRI.createVirtualRegister(TRI->getBoolRC());
+ SDValue TmpCopyTo = DAG.getCopyToReg(Chain, DL, TmpVReg, Val);
+ Val = DAG.getCopyFromReg(TmpCopyTo, DL, TmpVReg, MVT::i1);
+ }
+ }
} else if (VA.isMemLoc()) {
report_fatal_error("TODO: return values in memory");
} else
@@ -15995,6 +16010,21 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
return false;
}
+LLVM_ATTRIBUTE_UNUSED
+static bool isCopyFromRegForI1Return(const SDNode *N) {
+ assert(N->getOpcode() == ISD::CopyFromReg);
+ SDNode *N1 = N->getOperand(0).getNode();
+ if (N1->getOpcode() != ISD::CopyToReg)
+ return false;
+ SDNode *N2 = N1->getOperand(0).getNode();
+ if (N2->getOpcode() != ISD::CopyFromReg)
+ return false;
+ SDNode *N3 = N2->getOperand(0).getNode();
+ if (N3->getOpcode() != ISD::CALLSEQ_END)
+ return false;
+ return true;
+}
+
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI,
UniformityInfo *UA) const {
@@ -16012,7 +16042,8 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
return UA->isDivergent(V);
- assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+ assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N) ||
+ isCopyFromRegForI1Return(N));
return !TRI->isSGPRReg(MRI, Reg);
}
case ISD::LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a04ce16cbddb6..32dad0c425c04 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -689,14 +689,6 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
- if (!SrcReg.isVirtual() &&
- TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
- // When calling convention allocates SGPR for i1, for GPUs with
- // wavefront size 64, i1 return value is put in 64b SGPR.
- assert(ST->isWave64());
- continue;
- }
-
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
>From df1bbe3c23080e39486bc0ad6be2a71092877d02 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 10 Mar 2024 19:18:55 -0500
Subject: [PATCH 12/20] Revert a formatting change made by clang-format.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3db884b78e007..08351c49b2231 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -866,7 +866,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -881,13 +881,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
return;
>From 4c098fed1b45576c60bed18ca1cdc6697f452a07 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 21 Mar 2024 18:51:58 -0500
Subject: [PATCH 13/20] This commit: (1) fixed i1 array as func return (2)
fixed i1 return when GlobalISel is used (3) zeroext/signext in i1 return is
ignored (4) inreg return of i1 is treated as i32 (5) new test files.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12 +-
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 2 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +-
.../GlobalISel/function-call-i1-return.ll | 294 +++++++
.../AMDGPU/GlobalISel/function-i1-args.ll | 569 ++++++++++++
.../AMDGPU/GlobalISel/function-returns.ll | 22 +-
.../GlobalISel/irtranslator-function-args.ll | 209 +----
llvm/test/CodeGen/AMDGPU/function-args.ll | 329 ++-----
.../CodeGen/AMDGPU/function-call-i1-return.ll | 198 +++++
llvm/test/CodeGen/AMDGPU/function-i1-args.ll | 819 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/function-returns.ll | 12 +-
12 files changed, 1996 insertions(+), 492 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/function-i1-args.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 53dbae7765803..2d25827906f15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -73,7 +73,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
if (TRI->isSGPRReg(MRI, PhysReg)) {
LLT Ty = MRI.getType(ExtReg);
LLT S32 = LLT::scalar(32);
- if (Ty != S32) {
+ if (Ty != S32 && Ty != LLT::scalar(64)) {
// FIXME: We should probably support readfirstlane intrinsics with all
// legal 32-bit types.
assert(Ty.getSizeInBits() == 32);
@@ -88,6 +88,9 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
{MRI.getType(ExtReg)})
.addReg(ExtReg);
ExtReg = ToSGPR.getReg(0);
+ if (VA.getLocVT() == MVT::i1 &&
+ MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ExtReg).getReg(0);
}
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -127,10 +130,9 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
unsigned CopyToBits = 32;
// When function return type is i1, it may be in a 64b register.
- if (VA.getLocVT() == MVT::i1) {
- if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
- CopyToBits = 64;
- }
+ if (VA.getLocVT() == MVT::i1 &&
+ MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+ CopyToBits = 64;
auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 8dd1daa642f9f..2c356731bf995 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -209,6 +209,8 @@ def CC_AMDGPU_Func : CallingConv<[
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
CCIfType<[i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1], CCIfInReg<CCPromoteToType<i32>>>,
+
CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 02cb248836df1..e117395eb699a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -816,6 +816,9 @@ EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
assert(!VT.isVector() && "only scalar expected");
+ if (VT == MVT::i1)
+ return MVT::i1;
+
// Round to the next multiple of 32-bits.
unsigned Size = VT.getSizeInBits();
if (Size <= 32)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a40fec97d6b91..618fdd95f4a4b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3240,11 +3240,13 @@ SDValue SITargetLowering::LowerCallResult(
Chain = Val.getValue(1);
InGlue = Val.getValue(2);
- // For i1 return value allocated to an SGPR, we want the dst reg for the
- // above CopyFromReg not to be of VReg_1 when emitting machine code.
- // This requires creating an addional CopyToReg followed by another
+ // For i1 return value allocated to an SGPR, the following is a
+ // workaround before SILowerI1Copies is fixed. Basically we want the
+ // dst reg for the above CopyFromReg not to be of the VReg_1 class
+ // when emitting machine code. This workaround creates an additional
+ // CopyToReg with a new virtual register, followed by another
// CopyFromReg.
- if (RVLocs.size() == 1 && VA.getLocVT() == MVT::i1) {
+ if (VA.getLocVT() == MVT::i1) {
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
@@ -16019,7 +16021,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
SDNode *N2 = N1->getOperand(0).getNode();
if (N2->getOpcode() != ISD::CopyFromReg)
return false;
- SDNode *N3 = N2->getOperand(0).getNode();
+
+ // Possibly multiple CopyFromReg nodes before getting to CALLSEQ_END,
+ // e.g., when the return value is an array.
+ SDNode *N3 = N2;
+ do {
+ N3 = N3->getOperand(0).getNode();
+ } while (N3->getOpcode() == ISD::CopyFromReg);
+
if (N3->getOpcode() != ISD::CALLSEQ_END)
return false;
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
new file mode 100644
index 0000000000000..24a51a9904d25
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
+
+define i1 @i1_func_void() {
+; GFX9-LABEL: name: i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT: SI_RETURN implicit $sgpr0
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_i1_func_void() {
+; GFX9-LABEL: name: test_call_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %val = call i1 @i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define zeroext i1 @zeroext_i1_func_void() {
+; GFX9-LABEL: name: zeroext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: zeroext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT: SI_RETURN implicit $sgpr0
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_zeroext_i1_func_void() {
+; GFX9-LABEL: name: test_call_zeroext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_zeroext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %val = call i1 @zeroext_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define signext i1 @signext_i1_func_void() {
+; GFX9-LABEL: name: signext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+;
+; GFX11-LABEL: name: signext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT: SI_RETURN implicit $sgpr0
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_signext_i1_func_void() {
+; GFX9-LABEL: name: test_call_signext_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_signext_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %val = call i1 @signext_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define inreg i1 @inreg_i1_func_void() {
+; GFX9-LABEL: name: inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT: SI_RETURN implicit $vgpr0
+;
+; GFX11-LABEL: name: inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: SI_RETURN implicit $vgpr0
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_inreg_i1_func_void() {
+; GFX9-LABEL: name: test_call_inreg_i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @inreg_i1_func_void
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @inreg_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_inreg_i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @inreg_i1_func_void
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @inreg_i1_func_void, csr_amdgpu, implicit-def $vgpr0
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %val = call i1 @inreg_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define [2 x i1] @a2i1_func_void() {
+; GFX9-LABEL: name: a2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
+; GFX9-NEXT: [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
+; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN2]](s32)
+; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[ANYEXT4]](s64)
+; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3
+;
+; GFX11-LABEL: name: a2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
+; GFX11-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
+; GFX11-NEXT: [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
+; GFX11-NEXT: $sgpr1 = COPY [[INTRIN2]](s32)
+; GFX11-NEXT: SI_RETURN implicit $sgpr0, implicit $sgpr1
+ %val = load [2 x i1], ptr addrspace(1) undef
+ ret [2 x i1] %val
+}
+
+define void @test_call_a2i1_func_void() {
+; GFX9-LABEL: name: test_call_a2i1_func_void
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_a2i1_func_void
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit-def $sgpr0, implicit-def $sgpr1
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %val = call [2 x i1] @a2i1_func_void()
+ store volatile [2 x i1] %val, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
new file mode 100644
index 0000000000000..f4c85df0e0a1b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -0,0 +1,569 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
+
+define void @void_func_i1(i1 %arg0) {
+; GFX9-LABEL: name: void_func_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1(i1 %val)
+ ret void
+}
+
+define void @void_func_i1_zeroext(i1 zeroext %arg0) {
+; GFX9-LABEL: name: void_func_i1_zeroext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX9-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
+; GFX9-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_zeroext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX11-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
+; GFX11-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %ext = zext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_zeroext() {
+; GFX9-LABEL: name: test_call_void_func_i1_zeroext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_zeroext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_zeroext(i1 %val)
+ ret void
+}
+
+define void @void_func_i1_signext(i1 signext %arg0) {
+; GFX9-LABEL: name: void_func_i1_signext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX9-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
+; GFX9-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_signext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX11-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
+; GFX11-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ %ext = sext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_signext() {
+; GFX9-LABEL: name: test_call_void_func_i1_signext
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_signext
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_signext(i1 %val)
+ ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: name: void_func_a2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_a2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
+; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store [2 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_a2i1() {
+; GFX9-LABEL: name: test_call_void_func_a2i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[CONST1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+; GFX9-NEXT: [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[CONST1]](s1)
+; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[CONST2]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_a2i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[CONST1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+; GFX11-NEXT: [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
+; GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST1]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
+; GFX11-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST2]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %1 = insertvalue [2 x i1] undef, i1 0, 0
+ %2 = insertvalue [2 x i1] %1, i1 1, 1
+ call void @void_func_a2i1([2 x i1] %2)
+ ret void
+}
+
+define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
+; GFX9-LABEL: name: void_func_i1_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[CONST]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
+; GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
+; GFX11-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_i1(i1 %val, i1 true)
+ ret void
+}
+
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: name: many_i1_args
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
+; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
+; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
+; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
+; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
+; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
+; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
+; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
+; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
+; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
+; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
+; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
+; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
+; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
+; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
+; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
+; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
+; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
+; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX9-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX9-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX9-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX9-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX9-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX9-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX9-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX9-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX9-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX9-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX9-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX9-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX9-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX9-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX9-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX9-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX9-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX9-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX9-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX9-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX9-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX9-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX9-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX9-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX9-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX9: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+;
+; GFX11-LABEL: name: many_i1_args
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
+; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
+; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
+; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
+; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
+; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
+; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
+; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
+; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
+; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
+; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
+; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
+; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
+; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
+; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
+; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
+; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
+; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
+; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
+; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
+; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+;
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
+ ret void
+}
+
+define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index e6c835fa25406..117a654d853f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -9,8 +9,10 @@ define i1 @i1_func_void() #0 {
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -20,9 +22,11 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
- ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -32,9 +36,11 @@ define signext i1 @i1_signext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32)
- ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+ ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index d239b7271dd89..eece4397d1855 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -3,7 +3,7 @@
; the frame info, so some functions have manually added stack object
; checks.
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=GFX1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
; FIXME: pre-VI should have same ABI without legal i16 operations.
define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -42,6 +42,16 @@ define void @void_func_i1(i1 %arg0) #0 {
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
+ ;
+ ; GFX11-LABEL: name: void_func_i1
+ ; GFX11: bb.1 (%ir-block.0):
+ ; GFX11-NEXT: liveins: $sgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+ ; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GFX11-NEXT: SI_RETURN
store i1 %arg0, ptr addrspace(1) undef
ret void
}
@@ -2781,8 +2791,8 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr16
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -3232,199 +3242,6 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 {
ret void
}
-; Check calling convention for i1 args
-define void @many_i1_args(
- i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
- i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
- i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
- i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; CHECK-LABEL: name: many_i1_args
-; CHECK: bb.1 (%ir-block.0):
-; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
-; CHECK-NEXT: {{ $}}
-; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr0
-; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr1
-; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr2
-; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr3
-; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr4
-; CHECK-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr5
-; CHECK-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr6
-; CHECK-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr7
-; CHECK-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr8
-; CHECK-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr9
-; CHECK-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr10
-; CHECK-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr11
-; CHECK-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr12
-; CHECK-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr13
-; CHECK-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr14
-; CHECK-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr15
-; CHECK-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr16
-; CHECK-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr17
-; CHECK-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr18
-; CHECK-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr19
-; CHECK-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr20
-; CHECK-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr21
-; CHECK-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr22
-; CHECK-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; CHECK-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr23
-; CHECK-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr24
-; CHECK-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
-;
-; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
-; CHECK: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-;
-; GFX11-LABEL: name: many_i1_args
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
-; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr17
-; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr18
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr19
-; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr20
-; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr21
-; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr22
-; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr23
-; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr24
-; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr25
-; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr26
-; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr27
-; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr28
-; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr29
-; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr0
-; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr1
-; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr2
-; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr3
-; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr4
-; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr5
-; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr6
-; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr7
-; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr8
-; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr9
-; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr10
-; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr11
-; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr12
-; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr13
-; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr14
-; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr15
-; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
-; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr16
-; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
-; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr17
-; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
-;
-; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
-; GFX11: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-
- store volatile i1 %arg0, ptr addrspace(1) undef
- store volatile i1 %arg1, ptr addrspace(1) undef
- store volatile i1 %arg2, ptr addrspace(1) undef
- store volatile i1 %arg3, ptr addrspace(1) undef
- store volatile i1 %arg4, ptr addrspace(1) undef
- store volatile i1 %arg5, ptr addrspace(1) undef
- store volatile i1 %arg6, ptr addrspace(1) undef
- store volatile i1 %arg7, ptr addrspace(1) undef
-
- store volatile i1 %arg8, ptr addrspace(1) undef
- store volatile i1 %arg9, ptr addrspace(1) undef
- store volatile i1 %arg10, ptr addrspace(1) undef
- store volatile i1 %arg11, ptr addrspace(1) undef
- store volatile i1 %arg12, ptr addrspace(1) undef
- store volatile i1 %arg13, ptr addrspace(1) undef
- store volatile i1 %arg14, ptr addrspace(1) undef
- store volatile i1 %arg15, ptr addrspace(1) undef
-
- store volatile i1 %arg16, ptr addrspace(1) undef
- store volatile i1 %arg17, ptr addrspace(1) undef
- store volatile i1 %arg18, ptr addrspace(1) undef
- store volatile i1 %arg19, ptr addrspace(1) undef
- store volatile i1 %arg20, ptr addrspace(1) undef
- store volatile i1 %arg21, ptr addrspace(1) undef
- store volatile i1 %arg22, ptr addrspace(1) undef
- store volatile i1 %arg23, ptr addrspace(1) undef
-
- store volatile i1 %arg24, ptr addrspace(1) undef
- store volatile i1 %arg25, ptr addrspace(1) undef
- store volatile i1 %arg26, ptr addrspace(1) undef
- store volatile i1 %arg27, ptr addrspace(1) undef
- store volatile i1 %arg28, ptr addrspace(1) undef
- store volatile i1 %arg29, ptr addrspace(1) undef
- store volatile i1 %arg30, ptr addrspace(1) undef
- store volatile i1 %arg31, ptr addrspace(1) undef
-
- ret void
-}
-
attributes #0 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 530e439ae572a..d6acf82318cee 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -149,6 +149,54 @@ bb2:
ret void
}
+define void @void_func_v2i1(<2 x i1> %arg0) #0 {
+; GFX9-LABEL: void_func_v2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+
+ store <2 x i1> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: void_func_a2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: buffer_store_byte v1, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+
+ store [2 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
define void @void_func_i8(i8 %arg0) #0 {
; CIGFX89-LABEL: void_func_i8:
; CIGFX89: ; %bb.0:
@@ -2780,14 +2828,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; CI-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2802,9 +2847,12 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v18, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v20
; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4887,275 +4935,4 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
ret void
}
-define void @many_i1_args(
- i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
- i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
- i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
- i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; GFX9-LABEL: many_i1_args:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 vcc, -1
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, vcc
-; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_writelane_b32 v20, s31, 1
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GFX9-NEXT: s_mov_b32 s31, 0xf000
-; GFX9-NEXT: s_mov_b32 s30, -1
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[8:9]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[10:11]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[12:13]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[14:15]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[16:17]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[18:19]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[22:23]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[24:25]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[26:27]
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[28:29]
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: buffer_store_byte v19, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT: buffer_store_byte v0, off, s[28:31], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v20, 1
-; GFX9-NEXT: v_readlane_b32 s30, v20, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: many_i1_args:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 vcc_lo, -1
-; GFX11-NEXT: scratch_store_b32 off, v7, s32 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo
-; GFX11-NEXT: v_writelane_b32 v7, s30, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
-; GFX11-NEXT: s_mov_b32 s30, -1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
-; GFX11-NEXT: v_writelane_b32 v7, s31, 1
-; GFX11-NEXT: s_mov_b32 s31, 0x31016000
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s9
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s11
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s13
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s15
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s16
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s17
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s18
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s19
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s20
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s21
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s22
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s24
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s25
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s26
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v5, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v6, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s27
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s28
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s29
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: buffer_store_b8 v2, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v3, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v4, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b8 v1, off, s[28:31], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_readlane_b32 s31, v7, 1
-; GFX11-NEXT: v_readlane_b32 s30, v7, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v7, off, s32 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- store volatile i1 %arg0, ptr addrspace(1) undef
- store volatile i1 %arg1, ptr addrspace(1) undef
- store volatile i1 %arg2, ptr addrspace(1) undef
- store volatile i1 %arg3, ptr addrspace(1) undef
- store volatile i1 %arg4, ptr addrspace(1) undef
- store volatile i1 %arg5, ptr addrspace(1) undef
- store volatile i1 %arg6, ptr addrspace(1) undef
- store volatile i1 %arg7, ptr addrspace(1) undef
-
- store volatile i1 %arg8, ptr addrspace(1) undef
- store volatile i1 %arg9, ptr addrspace(1) undef
- store volatile i1 %arg10, ptr addrspace(1) undef
- store volatile i1 %arg11, ptr addrspace(1) undef
- store volatile i1 %arg12, ptr addrspace(1) undef
- store volatile i1 %arg13, ptr addrspace(1) undef
- store volatile i1 %arg14, ptr addrspace(1) undef
- store volatile i1 %arg15, ptr addrspace(1) undef
-
- store volatile i1 %arg16, ptr addrspace(1) undef
- store volatile i1 %arg17, ptr addrspace(1) undef
- store volatile i1 %arg18, ptr addrspace(1) undef
- store volatile i1 %arg19, ptr addrspace(1) undef
- store volatile i1 %arg20, ptr addrspace(1) undef
- store volatile i1 %arg21, ptr addrspace(1) undef
- store volatile i1 %arg22, ptr addrspace(1) undef
- store volatile i1 %arg23, ptr addrspace(1) undef
-
- store volatile i1 %arg24, ptr addrspace(1) undef
- store volatile i1 %arg25, ptr addrspace(1) undef
- store volatile i1 %arg26, ptr addrspace(1) undef
- store volatile i1 %arg27, ptr addrspace(1) undef
- store volatile i1 %arg28, ptr addrspace(1) undef
- store volatile i1 %arg29, ptr addrspace(1) undef
- store volatile i1 %arg30, ptr addrspace(1) undef
- store volatile i1 %arg31, ptr addrspace(1) undef
-
- ret void
-}
-
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
new file mode 100644
index 0000000000000..91c739701a1a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+
+define i1 @i1_func_void() {
+; GFX9-LABEL: i1_func_void:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_i1_func_void() {
+; GFX9-LABEL: test_call_i1_func_void:
+; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_i1_func_void:
+; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+ %val = call i1 @i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define zeroext i1 @zeroext_i1_func_void() {
+; GFX9-LABEL: zeroext_i1_func_void:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: zeroext_i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_zeroext_i1_func_void() {
+; GFX9-LABEL: test_call_zeroext_i1_func_void:
+; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_zeroext_i1_func_void:
+; GFX11: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+ %val = call i1 @zeroext_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define signext i1 @signext_i1_func_void() {
+; GFX9-LABEL: signext_i1_func_void:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: signext_i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_signext_i1_func_void() {
+; GFX9-LABEL: test_call_signext_i1_func_void:
+; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_signext_i1_func_void:
+; GFX11: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+ %val = call i1 @signext_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define inreg i1 @inreg_i1_func_void() {
+; GFX9-LABEL: inreg_i1_func_void:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: inreg_i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load i1, ptr addrspace(1) undef
+ ret i1 %val
+}
+
+define void @test_call_inreg_i1_func_void() {
+; GFX9-LABEL: test_call_inreg_i1_func_void:
+; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+;
+; GFX11-LABEL: test_call_inreg_i1_func_void:
+; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+ %val = call i1 @inreg_i1_func_void()
+ store volatile i1 %val, ptr addrspace(1) undef
+ ret void
+}
+
+define [2 x i1] @a2i1_func_void() {
+; GFX9-LABEL: a2i1_func_void:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val = load [2 x i1], ptr addrspace(1) undef
+ ret [2 x i1] %val
+}
+
+define void @test_call_a2i1_func_void() {
+; GFX9-LABEL: test_call_a2i1_func_void:
+;
+; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_cmp_ne_u32_e64 s1, s1, 0
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+ %val = call [2 x i1] @a2i1_func_void()
+ store volatile [2 x i1] %val, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
new file mode 100644
index 0000000000000..55f3422e5c834
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -0,0 +1,819 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) {
+; GFX9-LABEL: void_func_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1() {
+; GFX9-LABEL: test_call_void_func_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1 at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1 at gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1(i1 %val)
+ ret void
+}
+
+define void @void_func_i1_zeroext(i1 zeroext %arg0) {
+; GFX9-LABEL: void_func_i1_zeroext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %ext = zext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_zeroext() {
+; GFX9-LABEL: test_call_void_func_i1_zeroext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_zeroext(i1 %val)
+ ret void
+}
+
+define void @void_func_i1_signext(i1 signext %arg0) {
+; GFX9-LABEL: void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v0, 12, v0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %ext = sext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_signext() {
+; GFX9-LABEL: test_call_void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_signext@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_signext@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_signext@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_signext@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_signext(i1 %val)
+ ret void
+}
+
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: void_func_a2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store [2 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_a2i1() {
+; GFX9-LABEL: test_call_void_func_a2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s10, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_a2i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_a2i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_a2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_a2i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_a2i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, -1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %1 = insertvalue [2 x i1] undef, i1 0, 0
+ %2 = insertvalue [2 x i1] %1, i1 1, 1
+ call void @void_func_a2i1([2 x i1] %2)
+ ret void
+}
+
+define void @i1_arg_i1_use(i1 %arg) {
+; CIGFX89-LABEL: i1_arg_i1_use:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
+; GFX9: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB{{[0-9]+}}_2:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_arg_i1_use:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s1, s0, -1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s1
+; GFX11: ; %bb.1: ; %bb1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: .LBB{{[0-9]+}}_2: ; %bb2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+bb:
+ br i1 %arg, label %bb2, label %bb1
+
+bb1:
+ store volatile i32 0, ptr addrspace(1) undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+define void @void_func_v2i1(<2 x i1> %arg0) {
+; GFX9-LABEL: void_func_v2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x i1> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_v2i1() {
+; GFX9-LABEL: test_call_void_func_v2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_v2i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_v2i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+;
+; GFX11-LABEL: test_call_void_func_v2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_v2i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_v2i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+ %1 = insertelement <2 x i1> undef, i1 0, i32 0
+ %2 = insertelement <2 x i1> %1, i1 1, i32 1
+ call void @void_func_v2i1(<2 x i1> %2)
+ ret void
+}
+
+define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_i1() {
+; GFX9-LABEL: test_call_void_func_i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s10, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s1, -1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_i1(i1 %val, i1 true)
+ ret void
+}
+
+define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_a2i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile [2 x i1] %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
+; GFX9-LABEL: many_i1_args:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[8:9]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[10:11]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[12:13]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[14:15]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[16:17]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[18:19]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[22:23]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[24:25]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[26:27]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[28:29]
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: many_i1_args:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s3
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s11
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s13
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s16
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s17
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s18
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s19
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s20
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s21
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s22
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s24
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s25
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s26
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s27
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s28
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s29
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ store volatile i1 %arg2, ptr addrspace(1) undef
+ store volatile i1 %arg3, ptr addrspace(1) undef
+ store volatile i1 %arg4, ptr addrspace(1) undef
+ store volatile i1 %arg5, ptr addrspace(1) undef
+ store volatile i1 %arg6, ptr addrspace(1) undef
+ store volatile i1 %arg7, ptr addrspace(1) undef
+
+ store volatile i1 %arg8, ptr addrspace(1) undef
+ store volatile i1 %arg9, ptr addrspace(1) undef
+ store volatile i1 %arg10, ptr addrspace(1) undef
+ store volatile i1 %arg11, ptr addrspace(1) undef
+ store volatile i1 %arg12, ptr addrspace(1) undef
+ store volatile i1 %arg13, ptr addrspace(1) undef
+ store volatile i1 %arg14, ptr addrspace(1) undef
+ store volatile i1 %arg15, ptr addrspace(1) undef
+
+ store volatile i1 %arg16, ptr addrspace(1) undef
+ store volatile i1 %arg17, ptr addrspace(1) undef
+ store volatile i1 %arg18, ptr addrspace(1) undef
+ store volatile i1 %arg19, ptr addrspace(1) undef
+ store volatile i1 %arg20, ptr addrspace(1) undef
+ store volatile i1 %arg21, ptr addrspace(1) undef
+ store volatile i1 %arg22, ptr addrspace(1) undef
+ store volatile i1 %arg23, ptr addrspace(1) undef
+
+ store volatile i1 %arg24, ptr addrspace(1) undef
+ store volatile i1 %arg25, ptr addrspace(1) undef
+ store volatile i1 %arg26, ptr addrspace(1) undef
+ store volatile i1 %arg27, ptr addrspace(1) undef
+ store volatile i1 %arg28, ptr addrspace(1) undef
+ store volatile i1 %arg29, ptr addrspace(1) undef
+ store volatile i1 %arg30, ptr addrspace(1) undef
+ store volatile i1 %arg31, ptr addrspace(1) undef
+
+ ret void
+}
+
+define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_i1_i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: s_and_b32 s4, s6, 1
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_and_b32 s0, s1, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index df2163c4f9578..fb5b4a704b8a1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -40,6 +40,8 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
+; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_zeroext_func_void:
@@ -49,6 +51,9 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) undef
ret i1 %val
@@ -62,7 +67,8 @@ define signext i1 @i1_signext_func_void() #0 {
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
-; GFX789-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_signext_func_void:
@@ -72,7 +78,9 @@ define signext i1 @i1_signext_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) undef
ret i1 %val
>From e6e574dabfd9a4cacdbe7924aa0c23dc47e413f5 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Mar 2024 16:15:26 -0500
Subject: [PATCH 14/20] This commit: (1) a fix for i1 return with GlobalISel
(2) testcases.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 17 +-
.../GlobalISel/function-call-i1-return.ll | 33 ++--
.../AMDGPU/GlobalISel/function-returns.ll | 21 +--
.../CodeGen/AMDGPU/function-call-i1-return.ll | 146 ++++++++++++++----
llvm/test/CodeGen/AMDGPU/function-i1-args.ll | 88 ++++++++++-
5 files changed, 217 insertions(+), 88 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2d25827906f15..2b2584e6cbe40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,6 +63,11 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
+ if (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ return;
+ }
+
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
@@ -88,9 +93,6 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
{MRI.getType(ExtReg)})
.addReg(ExtReg);
ExtReg = ToSGPR.getReg(0);
- if (VA.getLocVT() == MVT::i1 &&
- MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
- ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ExtReg).getReg(0);
}
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -127,12 +129,9 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
if (VA.getLocVT().getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- unsigned CopyToBits = 32;
-
- // When function return type is i1, it may be in a 64b register.
- if (VA.getLocVT() == MVT::i1 &&
- MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
- CopyToBits = 64;
+ //
+ // However, when function return type is i1, it may be in a 64b register.
+ unsigned CopyToBits = (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) ? 64 : 32;
auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 24a51a9904d25..86198dd70218b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -7,11 +7,8 @@ define i1 @i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: i1_func_void
; GFX11: bb.1 (%ir-block.0):
@@ -61,11 +58,8 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: zeroext_i1_func_void
; GFX11: bb.1 (%ir-block.0):
@@ -115,11 +109,8 @@ define signext i1 @signext_i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: signext_i1_func_void
; GFX11: bb.1 (%ir-block.0):
@@ -223,15 +214,9 @@ define [2 x i1] @a2i1_func_void() {
; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
-; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
-; GFX9-NEXT: [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
-; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN2]](s32)
-; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[ANYEXT4]](s64)
-; GFX9-NEXT: SI_RETURN implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[LOAD2]](s1)
+; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: a2i1_func_void
; GFX11: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 117a654d853f5..252afe1712464 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -8,11 +8,8 @@ define i1 @i1_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
- ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -22,11 +19,8 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
- ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -36,11 +30,8 @@ define signext i1 @i1_signext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
- ; CHECK-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[INTRIN]](s32)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ANYEXT2]](s64)
- ; CHECK-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 91c739701a1a8..5319bbac3a087 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -1,26 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+ at GV = external addrspace(1) global i32
define i1 @i1_func_void() {
; GFX9-LABEL: i1_func_void:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) @GV
ret i1 %val
}
@@ -46,7 +61,13 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX9-LABEL: zeroext_i1_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
@@ -55,13 +76,19 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX11-LABEL: zeroext_i1_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) @GV
ret i1 %val
}
@@ -72,7 +99,7 @@ define void @test_call_zeroext_i1_func_void() {
; GFX9-NEXT: global_store_byte v[0:1], v0, off
;
; GFX11-LABEL: test_call_zeroext_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[4:5]
+; GFX11: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
@@ -87,7 +114,13 @@ define signext i1 @signext_i1_func_void() {
; GFX9-LABEL: signext_i1_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
@@ -96,13 +129,19 @@ define signext i1 @signext_i1_func_void() {
; GFX11-LABEL: signext_i1_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) @GV
ret i1 %val
}
@@ -113,7 +152,7 @@ define void @test_call_signext_i1_func_void() {
; GFX9-NEXT: global_store_byte v[0:1], v0, off
;
; GFX11-LABEL: test_call_signext_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[4:5]
+; GFX11: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
@@ -128,17 +167,29 @@ define inreg i1 @inreg_i1_func_void() {
; GFX9-LABEL: inreg_i1_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: inreg_i1_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) @GV
ret i1 %val
}
@@ -164,33 +215,64 @@ define [2 x i1] @a2i1_func_void() {
; GFX9-LABEL: a2i1_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, GV at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, GV at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5]
+; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
- %val = load [2 x i1], ptr addrspace(1) undef
+;
+; GFX11-LABEL: a2i1_func_void:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, GV at gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, GV at gotpcrel32@hi+12
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u8 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load [2 x i1], ptr addrspace(1) @GV
ret [2 x i1] %val
}
define void @test_call_a2i1_func_void() {
; GFX9-LABEL: test_call_a2i1_func_void:
+; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
;
+; GFX11-LABEL: test_call_a2i1_func_void:
; GFX11: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, s1, 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
%val = call [2 x i1] @a2i1_func_void()
store volatile [2 x i1] %val, ptr addrspace(1) undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index 55f3422e5c834..c9877db735ebb 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -21,7 +21,7 @@ define void @void_func_i1(i1 %arg0) {
ret void
}
-define void @test_call_void_func_i1() {
+define void @test_call_void_func_i1(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -66,7 +66,7 @@ define void @test_call_void_func_i1() {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) %in
call void @void_func_i1(i1 %val)
ret void
}
@@ -95,7 +95,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
ret void
}
-define void @test_call_void_func_i1_zeroext() {
+define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,7 +140,7 @@ define void @test_call_void_func_i1_zeroext() {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) %in
call void @void_func_i1_zeroext(i1 %val)
ret void
}
@@ -169,7 +169,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
ret void
}
-define void @test_call_void_func_i1_signext() {
+define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -214,7 +214,7 @@ define void @test_call_void_func_i1_signext() {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) %in
call void @void_func_i1_signext(i1 %val)
ret void
}
@@ -401,6 +401,78 @@ define void @test_call_void_func_v2i1() {
ret void
}
+define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
+; GFX9-LABEL: void_func_v2i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e64 v0, 1, s5
+; GFX9-NEXT: v_and_b32_e64 v1, s4, 1
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b16 v0, 1, s1
+; GFX11-NEXT: v_and_b32_e64 v1, s0, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x i1> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_v2i1_inreg() {
+; GFX9-LABEL: test_call_void_func_v2i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_v2i1_inreg@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_v2i1_inreg@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_v2i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_v2i1_inreg@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_v2i1_inreg@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, 1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %1 = insertelement <2 x i1> undef, i1 0, i32 0
+ %2 = insertelement <2 x i1> %1, i1 1, i32 1
+ call void @void_func_v2i1_inreg(<2 x i1> %2)
+ ret void
+}
+
define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
; GFX9-LABEL: void_func_i1_i1:
; GFX9: ; %bb.0:
@@ -428,7 +500,7 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
ret void
}
-define void @test_call_void_func_i1_i1() {
+define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +547,7 @@ define void @test_call_void_func_i1_i1() {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
- %val = load i1, ptr addrspace(1) undef
+ %val = load i1, ptr addrspace(1) %in
call void @void_func_i1_i1(i1 %val, i1 true)
ret void
}
>From 4f54c9847c5b9abb98c78e809b82693bd6480421 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Mar 2024 16:25:27 -0500
Subject: [PATCH 15/20] Fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2b2584e6cbe40..6f2425c71f09a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,7 +63,8 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
- if (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+ if (VA.getLocVT() == MVT::i1 &&
+ MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
MIRBuilder.buildCopy(PhysReg, ValVReg);
return;
}
@@ -131,7 +132,11 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
//
// However, when function return type is i1, it may be in a 64b register.
- unsigned CopyToBits = (VA.getLocVT() == MVT::i1 && MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) ? 64 : 32;
+ unsigned CopyToBits =
+ (VA.getLocVT() == MVT::i1 &&
+ MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
+ ? 64
+ : 32;
auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
@@ -276,7 +281,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
}
};
-}
+} // namespace
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
>From a79ddaeaf0fe54d14d8dcfb7d582884861c76263 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 1 Apr 2024 13:18:42 -0500
Subject: [PATCH 16/20] Use update_llc_test_checks.py on new test files; remove
incorrect comments in 2 new test files.
---
.../GlobalISel/function-call-i1-return.ll | 1 -
.../AMDGPU/GlobalISel/function-i1-args.ll | 1 -
.../CodeGen/AMDGPU/function-call-i1-return.ll | 276 ++++++++++++++--
llvm/test/CodeGen/AMDGPU/function-i1-args.ll | 294 +++++++++++++-----
4 files changed, 479 insertions(+), 93 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 86198dd70218b..81a1994b5afb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index f4c85df0e0a1b..134751ee1e313 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9 -enable-var-scope %s
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 5319bbac3a087..0b3366f71d89c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -6,7 +6,7 @@
define i1 @i1_func_void() {
; GFX9-LABEL: i1_func_void:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, GV@gotpcrel32@lo+4
@@ -21,7 +21,7 @@ define i1 @i1_func_void() {
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, GV@gotpcrel32@lo+4
@@ -41,17 +41,65 @@ define i1 @i1_func_void() {
define void @test_call_i1_func_void() {
; GFX9-LABEL: test_call_i1_func_void:
-; GFX9: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
ret void
@@ -94,17 +142,65 @@ define zeroext i1 @zeroext_i1_func_void() {
define void @test_call_zeroext_i1_func_void() {
; GFX9-LABEL: test_call_zeroext_i1_func_void:
-; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, zeroext_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, zeroext_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_zeroext_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, zeroext_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, zeroext_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @zeroext_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
ret void
@@ -147,17 +243,65 @@ define signext i1 @signext_i1_func_void() {
define void @test_call_signext_i1_func_void() {
; GFX9-LABEL: test_call_signext_i1_func_void:
-; GFX9: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, signext_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, signext_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_signext_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, signext_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, signext_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @signext_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
ret void
@@ -165,7 +309,7 @@ define void @test_call_signext_i1_func_void() {
define inreg i1 @inreg_i1_func_void() {
; GFX9-LABEL: inreg_i1_func_void:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, GV@gotpcrel32@lo+4
@@ -178,7 +322,7 @@ define inreg i1 @inreg_i1_func_void() {
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: inreg_i1_func_void:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, GV@gotpcrel32@lo+4
@@ -195,17 +339,65 @@ define inreg i1 @inreg_i1_func_void() {
define void @test_call_inreg_i1_func_void() {
; GFX9-LABEL: test_call_inreg_i1_func_void:
-; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, inreg_i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, inreg_i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_inreg_i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, inreg_i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, inreg_i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @inreg_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
ret void
@@ -257,15 +449,55 @@ define [2 x i1] @a2i1_func_void() {
define void @test_call_a2i1_func_void() {
; GFX9-LABEL: test_call_a2i1_func_void:
-; GFX9: s_swappc_b64 s[30:31], s[4:5]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, a2i1_func_void@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, a2i1_func_void@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_a2i1_func_void:
-; GFX11: s_swappc_b64 s[30:31], s[0:1]
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, a2i1_func_void@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, a2i1_func_void@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
@@ -273,6 +505,14 @@ define void @test_call_a2i1_func_void() {
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call [2 x i1] @a2i1_func_void()
store volatile [2 x i1] %val, ptr addrspace(1) undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index c9877db735ebb..2d63695674404 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -23,7 +23,7 @@ define void @void_func_i1(i1 %arg0) {
define void @test_call_void_func_i1(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -43,14 +43,23 @@ define void @test_call_void_func_i1(ptr addrspace(1) %in) {
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_add_i32 s32, s32, 16
@@ -66,6 +75,15 @@ define void @test_call_void_func_i1(ptr addrspace(1) %in) {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) %in
call void @void_func_i1(i1 %val)
ret void
@@ -97,7 +115,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_zeroext:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -117,14 +135,23 @@ define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_i1_zeroext:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_add_i32 s32, s32, 16
@@ -140,6 +167,15 @@ define void @test_call_void_func_i1_zeroext(ptr addrspace(1) %in) {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) %in
call void @void_func_i1_zeroext(i1 %val)
ret void
@@ -171,7 +207,7 @@ define void @void_func_i1_signext(i1 signext %arg0) {
define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_signext:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -191,14 +227,23 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_i1_signext:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_add_i32 s32, s32, 16
@@ -214,6 +259,15 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) %in
call void @void_func_i1_signext(i1 %val)
ret void
@@ -221,17 +275,17 @@ define void @test_call_void_func_i1_signext(ptr addrspace(1) %in) {
define void @void_func_a2i1([2 x i1] %arg0) {
; GFX9-LABEL: void_func_a2i1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GFX9-NEXT: global_store_byte v[0:1], v1, off
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a2i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
@@ -245,7 +299,7 @@ define void @void_func_a2i1([2 x i1] %arg0) {
define void @test_call_void_func_a2i1() {
; GFX9-LABEL: test_call_void_func_a2i1:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s10, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -263,10 +317,45 @@ define void @test_call_void_func_a2i1() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s10
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_a2i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, void_func_a2i1@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, void_func_a2i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, -1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT; s_mov_b32 s4, s33
; GFX11-NEXT; s_mov_b32 s33, s32
; GFX11-NEXT; s_xor_saveexec_b32 s0, -1
@@ -293,18 +382,20 @@ define void @i1_arg_i1_use(i1 %arg) {
; CIGFX89-LABEL: i1_arg_i1_use:
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
-; GFX9: ; %bb.1:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_store_dword v[0:1], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: .LBB{{[0-9]+}}_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: i1_arg_i1_use:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB8_2
+; GFX9-NEXT: ; %bb.1: ; %bb1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB8_2: ; %bb2
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
; GFX11: ; %bb.0: ; %bb
@@ -312,11 +403,12 @@ define void @i1_arg_i1_use(i1 %arg) {
; GFX11-NEXT: s_xor_b32 s1, s0, -1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s1
-; GFX11: ; %bb.1: ; %bb1
+; GFX11-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %bb1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: .LBB{{[0-9]+}}_2: ; %bb2
+; GFX11-NEXT: .LBB8_2: ; %bb2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -332,15 +424,15 @@ bb2:
define void @void_func_v2i1(<2 x i1> %arg0) {
; GFX9-LABEL: void_func_v2i1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v2i1:
; GFX11: ; %bb.0:
@@ -358,7 +450,7 @@ define void @void_func_v2i1(<2 x i1> %arg0) {
define void @test_call_void_func_v2i1() {
; GFX9-LABEL: test_call_void_func_v2i1:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -376,14 +468,23 @@ define void @test_call_void_func_v2i1() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_v2i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -395,6 +496,16 @@ define void @test_call_void_func_v2i1() {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%1 = insertelement <2 x i1> undef, i1 0, i32 0
%2 = insertelement <2 x i1> %1, i1 1, i32 1
call void @void_func_v2i1(<2 x i1> %2)
@@ -403,7 +514,7 @@ define void @test_call_void_func_v2i1() {
define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
; GFX9-LABEL: void_func_v2i1_inreg:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e64 v0, 1, s5
; GFX9-NEXT: v_and_b32_e64 v1, s4, 1
@@ -414,7 +525,7 @@ define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v2i1_inreg:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, 1, s1
; GFX11-NEXT: v_and_b32_e64 v1, s0, 1
@@ -429,7 +540,7 @@ define void @void_func_v2i1_inreg(<2 x i1> inreg %arg0) {
define void @test_call_void_func_v2i1_inreg() {
; GFX9-LABEL: test_call_void_func_v2i1_inreg:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -447,14 +558,23 @@ define void @test_call_void_func_v2i1_inreg() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_v2i1_inreg:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -467,6 +587,16 @@ define void @test_call_void_func_v2i1_inreg() {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%1 = insertelement <2 x i1> undef, i1 0, i32 0
%2 = insertelement <2 x i1> %1, i1 1, i32 1
call void @void_func_v2i1_inreg(<2 x i1> %2)
@@ -502,7 +632,7 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
; GFX9-LABEL: test_call_void_func_i1_i1:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s10, s33
; GFX9-NEXT: s_mov_b32 s33, s32
@@ -523,14 +653,23 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s10
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_void_func_i1_i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_add_i32 s32, s32, 16
@@ -547,6 +686,15 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i1, ptr addrspace(1) %in
call void @void_func_i1_i1(i1 %val, i1 true)
ret void
@@ -554,21 +702,21 @@ define void @test_call_void_func_i1_i1(ptr addrspace(1) %in) {
define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
; GFX9-LABEL: void_func_a2i1_i1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GFX9-NEXT: global_store_byte v[0:1], v1, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
-; GFX9-NEXT: global_store_byte v[0:1], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a2i1_i1:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
@@ -586,12 +734,8 @@ define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
}
define void @many_i1_args(
- i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
- i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
- i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
- i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
; GFX9-LABEL: many_i1_args:
-; GFX9: ; %bb.0:
+; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; GFX9-NEXT: global_store_byte v[0:1], v19, off
@@ -692,7 +836,7 @@ define void @many_i1_args(
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: many_i1_args:
-; GFX11: ; %bb.0:
+; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
@@ -791,6 +935,10 @@ define void @many_i1_args(
; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
store volatile i1 %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
store volatile i1 %arg2, ptr addrspace(1) undef
>From d3338c9365a2d77b15b5665c1e08a7493d635dca Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7@amd.com>
Date: Mon, 29 Apr 2024 19:04:50 -0500
Subject: [PATCH 17/20] For GlobalISel: (1) for incoming i1 arg/return, do not
generate G_TRUNC; however, we need to set the register class to avoid later
 problems with instruction selection (2) for outgoing i1, do not differentiate
between wavesize32 and wavesize64.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 27 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 6 +
.../GlobalISel/function-call-i1-return.ll | 78 ++----
.../AMDGPU/GlobalISel/function-i1-args.ll | 252 +++++++-----------
.../irtranslator-call-return-values.ll | 15 +-
.../GlobalISel/irtranslator-function-args.ll | 34 +--
.../GlobalISel/irtranslator-invariant.ll | 5 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 -
8 files changed, 165 insertions(+), 254 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6f2425c71f09a..5e42ecd0f956d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -63,8 +63,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
- if (VA.getLocVT() == MVT::i1 &&
- MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+ if (VA.getLocVT() == MVT::i1) {
MIRBuilder.buildCopy(PhysReg, ValVReg);
return;
}
@@ -79,7 +78,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
if (TRI->isSGPRReg(MRI, PhysReg)) {
LLT Ty = MRI.getType(ExtReg);
LLT S32 = LLT::scalar(32);
- if (Ty != S32 && Ty != LLT::scalar(64)) {
+ if (Ty != S32) {
// FIXME: We should probably support readfirstlane intrinsics with all
// legal 32-bit types.
assert(Ty.getSizeInBits() == 32);
@@ -127,18 +126,19 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
const CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
+ if (VA.getLocVT() == MVT::i1) {
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ MRI.setRegClass(ValVReg, MIRBuilder.getMF()
+ .getSubtarget<GCNSubtarget>()
+ .getRegisterInfo()
+ ->getBoolRC());
+ return;
+ }
+
if (VA.getLocVT().getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- //
- // However, when function return type is i1, it may be in a 64b register.
- unsigned CopyToBits =
- (VA.getLocVT() == MVT::i1 &&
- MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
- ? 64
- : 32;
-
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(CopyToBits), PhysReg);
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
// If we have signext/zeroext, it applies to the whole 32-bit register
// before truncation.
@@ -248,8 +248,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- if (VA.getLocVT() == MVT::i1 &&
- MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64()) {
+ if (VA.getLocVT() == MVT::i1) {
MIRBuilder.buildCopy(PhysReg, ValVReg);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b48a09489653a..f12fe7f1118ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -131,6 +131,12 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
Register SrcReg = Src.getReg();
if (isVCC(DstReg, *MRI)) {
+ if (SrcReg.isPhysical() && SrcReg != AMDGPU::SCC) {
+ const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(DstReg);
+ if (DstRC)
+ return DstRC->contains(SrcReg);
+ }
+
if (SrcReg == AMDGPU::SCC) {
const TargetRegisterClass *RC
= TRI.getConstrainedRegClassForOperand(Dst, *MRI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 81a1994b5afb1..32c7c434d4716 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -13,10 +13,8 @@ define i1 @i1_func_void() {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT: SI_RETURN implicit $sgpr0
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -30,10 +28,9 @@ define void @test_call_i1_func_void() {
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: test_call_i1_func_void
@@ -42,10 +39,9 @@ define void @test_call_i1_func_void() {
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
%val = call i1 @i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
@@ -64,10 +60,8 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT: SI_RETURN implicit $sgpr0
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -81,10 +75,9 @@ define void @test_call_zeroext_i1_func_void() {
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: test_call_zeroext_i1_func_void
@@ -93,10 +86,9 @@ define void @test_call_zeroext_i1_func_void() {
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
%val = call i1 @zeroext_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
@@ -115,10 +107,8 @@ define signext i1 @signext_i1_func_void() {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT: SI_RETURN implicit $sgpr0
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
@@ -132,10 +122,9 @@ define void @test_call_signext_i1_func_void() {
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: test_call_signext_i1_func_void
@@ -144,10 +133,9 @@ define void @test_call_signext_i1_func_void() {
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit-def $sgpr0
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
%val = call i1 @signext_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
@@ -224,13 +212,9 @@ define [2 x i1] @a2i1_func_void() {
; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s1) = G_LOAD [[PTRADD]](p1) :: (load (s1) from `ptr addrspace(1) undef` + 1, addrspace 1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: [[INTRIN:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr0 = COPY [[INTRIN]](s32)
-; GFX11-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s1)
-; GFX11-NEXT: [[INTRIN2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT3]](s32)
-; GFX11-NEXT: $sgpr1 = COPY [[INTRIN2]](s32)
-; GFX11-NEXT: SI_RETURN implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[LOAD2]](s1)
+; GFX11-NEXT: SI_RETURN
%val = load [2 x i1], ptr addrspace(1) undef
ret [2 x i1] %val
}
@@ -244,15 +228,13 @@ define void @test_call_a2i1_func_void() {
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr2_sgpr3
-; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY3]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: test_call_a2i1_func_void
@@ -261,15 +243,13 @@ define void @test_call_a2i1_func_void() {
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a2i1_func_void
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @a2i1_func_void, csr_amdgpu, implicit-def $sgpr0, implicit-def $sgpr1
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
; GFX11-NEXT: SI_RETURN
%val = call [2 x i1] @a2i1_func_void()
store volatile [2 x i1] %val, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 134751ee1e313..3e554fc8b638b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -6,20 +6,18 @@ define void @void_func_i1(i1 %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: void_func_i1
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
store i1 %arg0, ptr addrspace(1) undef
ret void
@@ -45,8 +43,7 @@ define void @test_call_void_func_i1() {
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX11-NEXT: SI_RETURN
@@ -60,11 +57,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
; GFX9-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
; GFX9-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
@@ -73,11 +69,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+; GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
; GFX11-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[CONST]]
; GFX11-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
@@ -107,8 +102,7 @@ define void @test_call_void_func_i1_zeroext() {
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX11-NEXT: SI_RETURN
@@ -122,11 +116,10 @@ define void @void_func_i1_signext(i1 signext %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
; GFX9-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
; GFX9-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
@@ -135,11 +128,10 @@ define void @void_func_i1_signext(i1 signext %arg0) {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+; GFX11-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
; GFX11-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[CONST]]
; GFX11-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
@@ -169,8 +161,7 @@ define void @test_call_void_func_i1_signext() {
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX11-NEXT: SI_RETURN
@@ -184,30 +175,26 @@ define void @void_func_a2i1([2 x i1] %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX9-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: void_func_a2i1
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; GFX11-NEXT: [[PTRADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST]](s64)
-; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[PTRADD]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
; GFX11-NEXT: SI_RETURN
store [2 x i1] %arg0, ptr addrspace(1) undef
ret void
@@ -234,10 +221,8 @@ define void @test_call_void_func_a2i1() {
; GFX11-NEXT: [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
-; GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST1]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
-; GFX11-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST2]](s1)
-; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[CONST1]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[CONST2]](s1)
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX11-NEXT: SI_RETURN
@@ -252,26 +237,22 @@ define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: void_func_i1_i1
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
store volatile i1 %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
@@ -301,10 +282,8 @@ define void @test_call_void_func_i1_i1() {
; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
-; GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
-; GFX11-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT2]](s32)
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[CONST]](s1)
; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0, implicit $sgpr1
; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX11-NEXT: SI_RETURN
@@ -322,32 +301,19 @@ define void @many_i1_args(
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
-; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr8_sgpr9
-; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
-; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $sgpr10_sgpr11
-; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s64)
-; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $sgpr12_sgpr13
-; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s64)
-; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $sgpr14_sgpr15
-; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s64)
-; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
-; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s64)
-; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $sgpr18_sgpr19
-; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s64)
-; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $sgpr20_sgpr21
-; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s64)
-; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY $sgpr22_sgpr23
-; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s64)
-; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY $sgpr24_sgpr25
-; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s64)
-; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY $sgpr26_sgpr27
-; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s64)
-; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY $sgpr28_sgpr29
-; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT: [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT: [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT: [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT: [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT: [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -388,82 +354,58 @@ define void @many_i1_args(
; GFX9-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
;
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to COPY1-COPY11 omitted
+; GFX9: G_STORE [[COPY12]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC13]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to TRUNC14-TRUNC30 omitted
; GFX9: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
;
; GFX11-LABEL: name: many_i1_args
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
-; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3
-; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[COPY3]](s32)
-; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4
-; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[COPY4]](s32)
-; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr5
-; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[COPY5]](s32)
-; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr6
-; GFX11-NEXT: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[COPY6]](s32)
-; GFX11-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr7
-; GFX11-NEXT: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[COPY7]](s32)
-; GFX11-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr8
-; GFX11-NEXT: [[TRUNC8:%[0-9]+]]:_(s1) = G_TRUNC [[COPY8]](s32)
-; GFX11-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr9
-; GFX11-NEXT: [[TRUNC9:%[0-9]+]]:_(s1) = G_TRUNC [[COPY9]](s32)
-; GFX11-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr10
-; GFX11-NEXT: [[TRUNC10:%[0-9]+]]:_(s1) = G_TRUNC [[COPY10]](s32)
-; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr11
-; GFX11-NEXT: [[TRUNC11:%[0-9]+]]:_(s1) = G_TRUNC [[COPY11]](s32)
-; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr12
-; GFX11-NEXT: [[TRUNC12:%[0-9]+]]:_(s1) = G_TRUNC [[COPY12]](s32)
-; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $sgpr13
-; GFX11-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
-; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $sgpr14
-; GFX11-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
-; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $sgpr15
-; GFX11-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
-; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $sgpr16
-; GFX11-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
-; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $sgpr17
-; GFX11-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
-; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $sgpr18
-; GFX11-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
-; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $sgpr19
-; GFX11-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
-; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $sgpr20
-; GFX11-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
-; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $sgpr21
-; GFX11-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
-; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $sgpr22
-; GFX11-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
-; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $sgpr23
-; GFX11-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
-; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $sgpr24
-; GFX11-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
-; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $sgpr25
-; GFX11-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
-; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $sgpr26
-; GFX11-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
-; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $sgpr27
-; GFX11-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
-; GFX11-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $sgpr28
-; GFX11-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
-; GFX11-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $sgpr29
-; GFX11-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
;
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; G_STOREs to COPY1-COPY28 omitted
+; GFX11: G_STORE [[COPY29]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; G_STOREs to TRUNC1-TRUNC30 omitted
+; GFX11-NEXT: G_STORE [[TRUNC30]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
store volatile i1 %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
@@ -509,12 +451,11 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr6, $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
-; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
@@ -522,12 +463,11 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1
; GFX11-NEXT: {{ $}}
-; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
store volatile i1 %arg0, ptr addrspace(1) undef
@@ -542,11 +482,10 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $sgpr6_sgpr7
-; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: void_func_i1_inreg_i1
@@ -555,11 +494,10 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
-; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
-; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: G_STORE [[TRUNC2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
store volatile i1 %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 3db0acceec0b3..ec999149daed8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -199,10 +199,9 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
- ; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: G_STORE [[COPY21]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
%val = call i1 @external_i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
@@ -276,10 +275,9 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
- ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+ ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY21]](s1)
; GCN-NEXT: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
%val = call i1 @external_i1_zeroext_func_void()
@@ -336,10 +334,9 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32)
; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32)
; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s64)
+ ; GCN-NEXT: [[COPY21:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
- ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+ ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY21]](s1)
; GCN-NEXT: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
%val = call i1 @external_i1_signext_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index eece4397d1855..5d2f794b94c4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -37,20 +37,18 @@ define void @void_func_i1(i1 %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ ; CHECK-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
;
; GFX11-LABEL: name: void_func_i1
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
- ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX11-NEXT: SI_RETURN
store i1 %arg0, ptr addrspace(1) undef
ret void
@@ -61,11 +59,10 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s1)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[C]]
; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -80,11 +77,10 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s1)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C]]
; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -100,14 +96,13 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: liveins: $sgpr16_sgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]]
- ; CHECK-NEXT: [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
- ; CHECK-NEXT: G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY]], [[C]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+ ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2
; CHECK-NEXT: G_BR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb1:
@@ -1998,8 +1993,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
- ; CHECK-NEXT: [[COPY31:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s64)
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5)
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
@@ -2009,7 +2003,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+ ; CHECK-NEXT: G_STORE [[COPY31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
; CHECK-NEXT: G_STORE [[LOAD2]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index 6360c5c2cbb2e..aa6f518a3e30f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -24,11 +24,10 @@ define i32 @load_select_const_i32_gv(i1 %cond) {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv0
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p1) = G_GLOBAL_VALUE @const_gv1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[TRUNC]](s1), [[GV]], [[GV1]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(p1) = G_SELECT [[COPY]](s1), [[GV]], [[GV1]]
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[SELECT]](p1) :: (dereferenceable invariant load (s32) from %ir.select, addrspace 1)
; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 4d04d6b7570c2..40cd5d88d4a38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -168,8 +168,6 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-LABEL: localize_internal_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, 1, s4
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
>From 4a82212cb3f27454a09f36a9f5c8b7d67bb461da Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 14:11:10 -0500
Subject: [PATCH 18/20] (1) avoid using reserved ScratchRSrcReg (2) update/add
testcases.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 53 +-
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 7 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +
.../GlobalISel/function-call-i1-return.ll | 18 +-
.../AMDGPU/GlobalISel/function-i1-args.ll | 518 +++++++++-
.../AMDGPU/GlobalISel/function-returns.ll | 6 +-
.../AMDGPU/GlobalISel/irtranslator-call.ll | 86 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 946 ++++++++++++------
.../codegen-prepare-addrspacecast-non-null.ll | 46 +-
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 12 +-
llvm/test/CodeGen/AMDGPU/extract-load-i1.ll | 2 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 25 +-
.../AMDGPU/fsub-as-fneg-src-modifier.ll | 212 ++--
.../CodeGen/AMDGPU/function-call-i1-return.ll | 24 +-
llvm/test/CodeGen/AMDGPU/function-returns.ll | 6 +-
.../identical-subrange-spill-infloop.ll | 599 ++++++-----
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 24 +-
.../AMDGPU/lds-global-non-entry-func.ll | 134 ++-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll | 6 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 480 +++------
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 598 ++++-------
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 108 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 244 ++---
.../AMDGPU/loop-on-function-argument.ll | 12 +-
.../test/CodeGen/AMDGPU/loop_exit_with_xor.ll | 4 +-
...p-var-out-of-divergent-loop-swdev407790.ll | 33 +-
.../CodeGen/AMDGPU/mul24-pass-ordering.ll | 16 +-
.../si-annotate-nested-control-flows.ll | 10 +-
.../si-optimize-vgpr-live-range-dbg-instr.ll | 22 +-
.../AMDGPU/srem-seteq-illegal-types.ll | 11 +-
.../CodeGen/AMDGPU/stacksave_stackrestore.ll | 56 +-
.../AMDGPU/urem-seteq-illegal-types.ll | 16 +-
33 files changed, 2230 insertions(+), 2120 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5e42ecd0f956d..c69cf8c34a6b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -65,6 +65,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
const CCValAssign &VA) override {
if (VA.getLocVT() == MVT::i1) {
MIRBuilder.buildCopy(PhysReg, ValVReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
return;
}
@@ -316,6 +317,31 @@ bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}
+/// Special handling for i1 return val: based on determineAndHandleAssignments()
+bool AMDGPUCallLowering::determineAndHandleAssignmentsForI1Return(
+ ValueHandler &Handler, ValueAssigner &Assigner,
+ SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv, bool IsVarArg) const {
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.enableFlatScratch()) {
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+ }
+
+ if (!determineAssignments(Assigner, Args, CCInfo))
+ return false;
+
+ return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder);
+}
+
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
@@ -378,8 +404,13 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
OutgoingValueAssigner Assigner(AssignFn);
AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
- return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
- CC, F.isVarArg());
+
+ if (SplitEVTs.size() == 1 && SplitEVTs[0] == MVT::i1)
+ return determineAndHandleAssignmentsForI1Return(
+ RetHandler, Assigner, SplitRetInfos, B, CC, F.isVarArg());
+ else
+ return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
+ CC, F.isVarArg());
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -1493,6 +1524,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ if (!ST.enableFlatScratch()) {
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+ }
+
// Do the actual argument marshalling.
SmallVector<Register, 8> PhysRegs;
@@ -1539,9 +1575,16 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
Info.IsVarArg);
IncomingValueAssigner Assigner(RetAssignFn);
CallReturnHandler Handler(MIRBuilder, MRI, MIB);
- if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
- Info.CallConv, Info.IsVarArg))
- return false;
+ if (Info.OrigRet.Ty->isIntegerTy(1)) {
+ if (!determineAndHandleAssignmentsForI1Return(Handler, Assigner, InArgs,
+ MIRBuilder, Info.CallConv,
+ Info.IsVarArg))
+ return false;
+ } else {
+ if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
+ return false;
+ }
}
uint64_t CalleePopBytes = NumBytes;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..afe3a7a19601a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,13 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+ bool determineAndHandleAssignmentsForI1Return(ValueHandler &Handler,
+ ValueAssigner &Assigner,
+ SmallVectorImpl<ArgInfo> &Args,
+ MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ bool IsVarArg) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56345d14a331c..aa44cca11f800 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3741,6 +3741,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (!DstBank)
DstBank = SrcBank;
+ // For i1 return value, the dst reg is an SReg but we need to set the reg
+ // bank to VCCRegBank.
+ if (!MI.getOperand(0).getReg().isVirtual() &&
+ SrcBank == &AMDGPU::VCCRegBank)
+ DstBank = SrcBank;
+
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
if (MI.getOpcode() != AMDGPU::G_FREEZE &&
cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 618fdd95f4a4b..d98045f422878 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3149,6 +3149,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
+ if (!Subtarget->enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
// Analyze outgoing return values.
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
@@ -3228,6 +3231,13 @@ SDValue SITargetLowering::LowerCallResult(
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
+
+ if (!Subtarget->enableFlatScratch()) {
+ SIMachineFunctionInfo *FuncInfo =
+ DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+ CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+ }
+
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
index 32c7c434d4716..a022c13f38f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-call-i1-return.ll
@@ -6,7 +6,7 @@ define i1 @i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: i1_func_void
@@ -27,8 +27,8 @@ define void @test_call_i1_func_void() {
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i1_func_void
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
@@ -53,7 +53,7 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: zeroext_i1_func_void
@@ -74,8 +74,8 @@ define void @test_call_zeroext_i1_func_void() {
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @zeroext_i1_func_void
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @zeroext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
@@ -100,7 +100,7 @@ define signext i1 @signext_i1_func_void() {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: SI_RETURN
;
; GFX11-LABEL: name: signext_i1_func_void
@@ -121,8 +121,8 @@ define void @test_call_signext_i1_func_void() {
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @signext_i1_func_void
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
-; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr0_sgpr1
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @signext_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 3e554fc8b638b..47c4682196d60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -30,10 +30,10 @@ define void @test_call_void_func_i1() {
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: SI_RETURN
;
@@ -89,10 +89,10 @@ define void @test_call_void_func_i1_zeroext() {
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_zeroext
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_zeroext, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: SI_RETURN
;
@@ -148,10 +148,10 @@ define void @test_call_void_func_i1_signext() {
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_signext
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_signext, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: SI_RETURN
;
@@ -207,11 +207,11 @@ define void @test_call_void_func_a2i1() {
; GFX9-NEXT: [[CONST2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_a2i1
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[CONST1]](s1)
-; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[CONST2]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[CONST1]](s1)
+; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST2]](s1)
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_a2i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: SI_RETURN
;
@@ -267,11 +267,11 @@ define void @test_call_void_func_i1_i1() {
; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_i1
-; GFX9-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
-; GFX9-NEXT: $sgpr2_sgpr3 = COPY [[CONST]](s1)
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST]](s1)
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr0_sgpr1, implicit $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_i1, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; GFX9-NEXT: SI_RETURN
;
@@ -292,12 +292,12 @@ define void @test_call_void_func_i1_i1() {
ret void
}
-define void @many_i1_args(
+define void @exhaust_sgprs_by_i1_args(
i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
-; GFX9-LABEL: name: many_i1_args
+; GFX9-LABEL: name: exhaust_sgprs_by_i1_args
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
; GFX9-NEXT: {{ $}}
@@ -361,7 +361,7 @@ define void @many_i1_args(
; G_STOREs to TRUNC14-TRUNC30 omitted
; GFX9: G_STORE [[TRUNC31]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
;
-; GFX11-LABEL: name: many_i1_args
+; GFX11-LABEL: name: exhaust_sgprs_by_i1_args
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
@@ -446,6 +446,237 @@ define void @many_i1_args(
ret void
}
+define void @void_func_a48i1([48 x i1] %arg0) {
+; GFX9-LABEL: name: void_func_a48i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr8_sgpr9
+; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_64(s1) = COPY $sgpr10_sgpr11
+; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_64(s1) = COPY $sgpr12_sgpr13
+; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_64(s1) = COPY $sgpr14_sgpr15
+; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_64(s1) = COPY $sgpr16_sgpr17
+; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_64(s1) = COPY $sgpr18_sgpr19
+; GFX9-NEXT: [[COPY8:%[0-9]+]]:sreg_64(s1) = COPY $sgpr20_sgpr21
+; GFX9-NEXT: [[COPY9:%[0-9]+]]:sreg_64(s1) = COPY $sgpr22_sgpr23
+; GFX9-NEXT: [[COPY10:%[0-9]+]]:sreg_64(s1) = COPY $sgpr24_sgpr25
+; GFX9-NEXT: [[COPY11:%[0-9]+]]:sreg_64(s1) = COPY $sgpr26_sgpr27
+; GFX9-NEXT: [[COPY12:%[0-9]+]]:sreg_64(s1) = COPY $sgpr28_sgpr29
+; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s1) = G_TRUNC [[COPY13]](s32)
+; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s1) = G_TRUNC [[COPY14]](s32)
+; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX9-NEXT: [[TRUNC15:%[0-9]+]]:_(s1) = G_TRUNC [[COPY15]](s32)
+; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX9-NEXT: [[TRUNC16:%[0-9]+]]:_(s1) = G_TRUNC [[COPY16]](s32)
+; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX9-NEXT: [[TRUNC17:%[0-9]+]]:_(s1) = G_TRUNC [[COPY17]](s32)
+; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX9-NEXT: [[TRUNC18:%[0-9]+]]:_(s1) = G_TRUNC [[COPY18]](s32)
+; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX9-NEXT: [[TRUNC19:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32)
+; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX9-NEXT: [[TRUNC20:%[0-9]+]]:_(s1) = G_TRUNC [[COPY20]](s32)
+; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX9-NEXT: [[TRUNC21:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32)
+; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX9-NEXT: [[TRUNC22:%[0-9]+]]:_(s1) = G_TRUNC [[COPY22]](s32)
+; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX9-NEXT: [[TRUNC23:%[0-9]+]]:_(s1) = G_TRUNC [[COPY23]](s32)
+; GFX9-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX9-NEXT: [[TRUNC24:%[0-9]+]]:_(s1) = G_TRUNC [[COPY24]](s32)
+; GFX9-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX9-NEXT: [[TRUNC25:%[0-9]+]]:_(s1) = G_TRUNC [[COPY25]](s32)
+; GFX9-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX9-NEXT: [[TRUNC26:%[0-9]+]]:_(s1) = G_TRUNC [[COPY26]](s32)
+; GFX9-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX9-NEXT: [[TRUNC27:%[0-9]+]]:_(s1) = G_TRUNC [[COPY27]](s32)
+; GFX9-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX9-NEXT: [[TRUNC28:%[0-9]+]]:_(s1) = G_TRUNC [[COPY28]](s32)
+; GFX9-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX9-NEXT: [[TRUNC29:%[0-9]+]]:_(s1) = G_TRUNC [[COPY29]](s32)
+; GFX9-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX9-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX9-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX9-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+; GFX9-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr19
+; GFX9-NEXT: [[TRUNC32:%[0-9]+]]:_(s1) = G_TRUNC [[COPY32]](s32)
+; GFX9-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr20
+; GFX9-NEXT: [[TRUNC33:%[0-9]+]]:_(s1) = G_TRUNC [[COPY33]](s32)
+; GFX9-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr21
+; GFX9-NEXT: [[TRUNC34:%[0-9]+]]:_(s1) = G_TRUNC [[COPY34]](s32)
+; GFX9-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr22
+; GFX9-NEXT: [[TRUNC35:%[0-9]+]]:_(s1) = G_TRUNC [[COPY35]](s32)
+; GFX9-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr23
+; GFX9-NEXT: [[TRUNC36:%[0-9]+]]:_(s1) = G_TRUNC [[COPY36]](s32)
+; GFX9-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr24
+; GFX9-NEXT: [[TRUNC37:%[0-9]+]]:_(s1) = G_TRUNC [[COPY37]](s32)
+; GFX9-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr25
+; GFX9-NEXT: [[TRUNC38:%[0-9]+]]:_(s1) = G_TRUNC [[COPY38]](s32)
+; GFX9-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr26
+; GFX9-NEXT: [[TRUNC39:%[0-9]+]]:_(s1) = G_TRUNC [[COPY39]](s32)
+; GFX9-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr27
+; GFX9-NEXT: [[TRUNC40:%[0-9]+]]:_(s1) = G_TRUNC [[COPY40]](s32)
+; GFX9-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr28
+; GFX9-NEXT: [[TRUNC41:%[0-9]+]]:_(s1) = G_TRUNC [[COPY41]](s32)
+; GFX9-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr29
+; GFX9-NEXT: [[TRUNC42:%[0-9]+]]:_(s1) = G_TRUNC [[COPY42]](s32)
+; GFX9-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr30
+; GFX9-NEXT: [[TRUNC43:%[0-9]+]]:_(s1) = G_TRUNC [[COPY43]](s32)
+; GFX9-NEXT: [[FRAME1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
+; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 16, addrspace 5)
+; GFX9-NEXT: [[TRUNC44:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
+; GFX9-NEXT: [[FRAME2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME2]](p5) :: (invariant load (s1) from %fixed-stack.2, align 4, addrspace 5)
+; GFX9-NEXT: [[TRUNC45:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD2]](s32)
+; GFX9-NEXT: [[FRAME3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME3]](p5) :: (invariant load (s1) from %fixed-stack.1, align 8, addrspace 5)
+; GFX9-NEXT: [[TRUNC46:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD3]](s32)
+; GFX9-NEXT: [[FRAME4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME4]](p5) :: (invariant load (s1) from %fixed-stack.0, align 4, addrspace 5)
+; GFX9-NEXT: [[TRUNC47:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD4]](s32)
+;
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: [[CONST1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX9-NEXT: [[PTRADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST1]](s64)
+; GFX9-NEXT: G_STORE [[COPY1]](s1), [[PTRADD1]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+;
+; G_STOREs to COPY2-COPY12, TRUNC13-TRUNC46 omitted
+; GFX9: [[CONST47:%[0-9]+]]:_(s64) = G_CONSTANT i64 47
+; GFX9-NEXT: [[PTRADD47:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST47]](s64)
+; GFX9-NEXT: G_STORE [[TRUNC47]](s1), [[PTRADD47]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 47, addrspace 1)
+
+ store [48 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_a64i1([64 x i1] %arg0) {
+; GFX11-LABEL: name: void_func_a64i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr2
+; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY $sgpr3
+; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY $sgpr4
+; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY $sgpr5
+; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY $sgpr6
+; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY $sgpr7
+; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY $sgpr8
+; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY $sgpr9
+; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY $sgpr10
+; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY $sgpr11
+; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY $sgpr12
+; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY $sgpr13
+; GFX11-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY $sgpr14
+; GFX11-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY $sgpr15
+; GFX11-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY $sgpr16
+; GFX11-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY $sgpr17
+; GFX11-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY $sgpr18
+; GFX11-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY $sgpr19
+; GFX11-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY $sgpr20
+; GFX11-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY $sgpr21
+; GFX11-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY $sgpr22
+; GFX11-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY $sgpr23
+; GFX11-NEXT: [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY $sgpr24
+; GFX11-NEXT: [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY $sgpr25
+; GFX11-NEXT: [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY $sgpr26
+; GFX11-NEXT: [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY $sgpr27
+; GFX11-NEXT: [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY $sgpr28
+; GFX11-NEXT: [[COPY29:%[0-9]+]]:sreg_32(s1) = COPY $sgpr29
+; GFX11-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX11-NEXT: [[TRUNC30:%[0-9]+]]:_(s1) = G_TRUNC [[COPY30]](s32)
+; GFX11-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX11-NEXT: [[TRUNC31:%[0-9]+]]:_(s1) = G_TRUNC [[COPY31]](s32)
+; GFX11-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX11-NEXT: [[TRUNC32:%[0-9]+]]:_(s1) = G_TRUNC [[COPY32]](s32)
+; GFX11-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr3
+; GFX11-NEXT: [[TRUNC33:%[0-9]+]]:_(s1) = G_TRUNC [[COPY33]](s32)
+; GFX11-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr4
+; GFX11-NEXT: [[TRUNC34:%[0-9]+]]:_(s1) = G_TRUNC [[COPY34]](s32)
+; GFX11-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr5
+; GFX11-NEXT: [[TRUNC35:%[0-9]+]]:_(s1) = G_TRUNC [[COPY35]](s32)
+; GFX11-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr6
+; GFX11-NEXT: [[TRUNC36:%[0-9]+]]:_(s1) = G_TRUNC [[COPY36]](s32)
+; GFX11-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr7
+; GFX11-NEXT: [[TRUNC37:%[0-9]+]]:_(s1) = G_TRUNC [[COPY37]](s32)
+; GFX11-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr8
+; GFX11-NEXT: [[TRUNC38:%[0-9]+]]:_(s1) = G_TRUNC [[COPY38]](s32)
+; GFX11-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr9
+; GFX11-NEXT: [[TRUNC39:%[0-9]+]]:_(s1) = G_TRUNC [[COPY39]](s32)
+; GFX11-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr10
+; GFX11-NEXT: [[TRUNC40:%[0-9]+]]:_(s1) = G_TRUNC [[COPY40]](s32)
+; GFX11-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr11
+; GFX11-NEXT: [[TRUNC41:%[0-9]+]]:_(s1) = G_TRUNC [[COPY41]](s32)
+; GFX11-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr12
+; GFX11-NEXT: [[TRUNC42:%[0-9]+]]:_(s1) = G_TRUNC [[COPY42]](s32)
+; GFX11-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr13
+; GFX11-NEXT: [[TRUNC43:%[0-9]+]]:_(s1) = G_TRUNC [[COPY43]](s32)
+; GFX11-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr14
+; GFX11-NEXT: [[TRUNC44:%[0-9]+]]:_(s1) = G_TRUNC [[COPY44]](s32)
+; GFX11-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr15
+; GFX11-NEXT: [[TRUNC45:%[0-9]+]]:_(s1) = G_TRUNC [[COPY45]](s32)
+; GFX11-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr16
+; GFX11-NEXT: [[TRUNC46:%[0-9]+]]:_(s1) = G_TRUNC [[COPY46]](s32)
+; GFX11-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr17
+; GFX11-NEXT: [[TRUNC47:%[0-9]+]]:_(s1) = G_TRUNC [[COPY47]](s32)
+; GFX11-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr18
+; GFX11-NEXT: [[TRUNC48:%[0-9]+]]:_(s1) = G_TRUNC [[COPY48]](s32)
+; GFX11-NEXT: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr19
+; GFX11-NEXT: [[TRUNC49:%[0-9]+]]:_(s1) = G_TRUNC [[COPY49]](s32)
+; GFX11-NEXT: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr20
+; GFX11-NEXT: [[TRUNC50:%[0-9]+]]:_(s1) = G_TRUNC [[COPY50]](s32)
+; GFX11-NEXT: [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr21
+; GFX11-NEXT: [[TRUNC51:%[0-9]+]]:_(s1) = G_TRUNC [[COPY51]](s32)
+; GFX11-NEXT: [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr22
+; GFX11-NEXT: [[TRUNC52:%[0-9]+]]:_(s1) = G_TRUNC [[COPY52]](s32)
+; GFX11-NEXT: [[COPY53:%[0-9]+]]:_(s32) = COPY $vgpr23
+; GFX11-NEXT: [[TRUNC53:%[0-9]+]]:_(s1) = G_TRUNC [[COPY53]](s32)
+; GFX11-NEXT: [[COPY54:%[0-9]+]]:_(s32) = COPY $vgpr24
+; GFX11-NEXT: [[TRUNC54:%[0-9]+]]:_(s1) = G_TRUNC [[COPY54]](s32)
+; GFX11-NEXT: [[COPY55:%[0-9]+]]:_(s32) = COPY $vgpr25
+; GFX11-NEXT: [[TRUNC55:%[0-9]+]]:_(s1) = G_TRUNC [[COPY55]](s32)
+; GFX11-NEXT: [[COPY56:%[0-9]+]]:_(s32) = COPY $vgpr26
+; GFX11-NEXT: [[TRUNC56:%[0-9]+]]:_(s1) = G_TRUNC [[COPY56]](s32)
+; GFX11-NEXT: [[COPY57:%[0-9]+]]:_(s32) = COPY $vgpr27
+; GFX11-NEXT: [[TRUNC57:%[0-9]+]]:_(s1) = G_TRUNC [[COPY57]](s32)
+; GFX11-NEXT: [[COPY58:%[0-9]+]]:_(s32) = COPY $vgpr28
+; GFX11-NEXT: [[TRUNC58:%[0-9]+]]:_(s1) = G_TRUNC [[COPY58]](s32)
+; GFX11-NEXT: [[COPY59:%[0-9]+]]:_(s32) = COPY $vgpr29
+; GFX11-NEXT: [[TRUNC59:%[0-9]+]]:_(s1) = G_TRUNC [[COPY59]](s32)
+; GFX11-NEXT: [[COPY60:%[0-9]+]]:_(s32) = COPY $vgpr30
+; GFX11-NEXT: [[TRUNC60:%[0-9]+]]:_(s1) = G_TRUNC [[COPY60]](s32)
+
+; GFX11-NEXT: [[FRAME0:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
+; GFX11-NEXT: [[LOAD0:%[0-9]+]]:_(s32) = G_LOAD [[FRAME0]](p5) :: (invariant load (s1) from %fixed-stack.2, align 16, addrspace 5)
+; GFX11-NEXT: [[TRUNC61:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD0]](s32)
+; GFX11-NEXT: [[FRAME1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
+; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME1]](p5) :: (invariant load (s1) from %fixed-stack.1, align 4, addrspace 5)
+; GFX11-NEXT: [[TRUNC62:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32)
+
+; GFX11-NEXT: [[FRAME2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME2]](p5) :: (invariant load (s1) from %fixed-stack.0, align 8, addrspace 5)
+; GFX11-NEXT: [[TRUNC63:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+;
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: [[CONST1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; GFX11-NEXT: [[PTRADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST1]]
+; GFX11-NEXT: G_STORE [[COPY1]](s1), [[PTRADD1]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 1, addrspace 1)
+;
+; GFX11: [[CONST63:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+; GFX11-NEXT: [[PTRADD63:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[CONST63]]
+; GFX11-NEXT: G_STORE [[TRUNC63]](s1), [[PTRADD63]](p1) :: (store (s1) into `ptr addrspace(1) undef` + 63, addrspace 1)
+
+ store [64 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
; GFX9-LABEL: name: void_func_i1_i1_inreg
; GFX9: bb.1 (%ir-block.0):
@@ -475,6 +706,41 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
ret void
}
+define void @test_call_void_func_i1_i1_inreg() {
+; GFX9-LABEL: name: test_call_void_func_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX9-NEXT: $sgpr6 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
+ ret void
+}
+
define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
; GFX9-LABEL: name: void_func_i1_inreg_i1
; GFX9: bb.1 (%ir-block.0):
@@ -504,3 +770,225 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
ret void
}
+define void @test_call_void_func_i1_inreg_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_inreg_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_inreg_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr1 = COPY [[CONST]](s1)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_inreg_i1(i1 inreg %val, i1 true)
+ ret void
+}
+
+define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_zeroext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_zeroext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_zeroext_i1_i1_inreg() {
+; GFX9-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
+; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX9-NEXT: $sgpr6 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
+; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
+; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_zeroext_i1_i1_inreg(i1 zeroext %val, i1 inreg true)
+ ret void
+}
+
+define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_zeroext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_zeroext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @test_call_void_func_i1_inreg_zeroext_i1() {
+; GFX9-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_inreg_zeroext_i1
+; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX9-NEXT: $sgpr4 = COPY [[ANYEXT]](s32)
+; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST]](s1)
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_inreg_zeroext_i1, csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_zeroext_i1]]
+; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+; GFX11-NEXT: $sgpr1 = COPY [[CONST]](s1)
+; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
+; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+; GFX11-NEXT: SI_RETURN
+
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_inreg_zeroext_i1(i1 inreg %val, i1 zeroext true)
+ ret void
+}
+
+define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: name: void_func_signext_i1_i1_inreg
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr6, $sgpr4_sgpr5
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s1) = COPY $sgpr4_sgpr5
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_signext_i1_i1_inreg
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32(s1) = COPY $sgpr0
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr1
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[COPY]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
+; GFX9-LABEL: name: void_func_i1_inreg_signext_i1
+; GFX9: bb.1 (%ir-block.0):
+; GFX9-NEXT: liveins: $sgpr4, $sgpr6_sgpr7
+; GFX9-NEXT: {{ $}}
+; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s1) = COPY $sgpr6_sgpr7
+; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX9-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX9-NEXT: SI_RETURN
+;
+; GFX11-LABEL: name: void_func_i1_inreg_signext_i1
+; GFX11: bb.1 (%ir-block.0):
+; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+; GFX11-NEXT: {{ $}}
+; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32(s1) = COPY $sgpr1
+; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+; GFX11-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: G_STORE [[COPY2]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1)
+; GFX11-NEXT: SI_RETURN
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 252afe1712464..0fa5418962763 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -8,7 +8,7 @@ define i1 @i1_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
@@ -19,7 +19,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
@@ -30,7 +30,7 @@ define signext i1 @i1_signext_func_void() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
- ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](s1)
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
; CHECK-NEXT: SI_RETURN
%val = load i1, ptr addrspace(1) undef
ret i1 %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index d0a17bc48c185..6bec8ac074239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5306,7 +5306,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[ANYEXT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5318,7 +5318,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i16_inreg(i16 inreg %arg)
@@ -5351,7 +5351,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[COPY9]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5363,7 +5363,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i32_inreg(i32 inreg %arg)
@@ -5399,8 +5399,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5412,7 +5412,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i64_inreg(i64 inreg %arg)
@@ -5448,8 +5448,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5461,7 +5461,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
@@ -5496,7 +5496,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[ANYEXT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5508,7 +5508,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f16_inreg(half inreg %arg)
@@ -5543,7 +5543,7 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[ANYEXT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5555,7 +5555,7 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_bf16_inreg(bfloat inreg %arg)
@@ -5588,7 +5588,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[COPY9]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5600,7 +5600,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f32_inreg(float inreg %arg)
@@ -5636,8 +5636,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5649,7 +5649,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f64_inreg(double inreg %arg)
@@ -5682,7 +5682,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](<2 x s16>)
+ ; CHECK-NEXT: $sgpr16 = COPY [[COPY9]](<2 x s16>)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5694,7 +5694,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
@@ -5735,8 +5735,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16)
; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV7]](<2 x s16>)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV8]](<2 x s16>)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV7]](<2 x s16>)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV8]](<2 x s16>)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5748,7 +5748,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
@@ -5784,8 +5784,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](<2 x s16>)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](<2 x s16>)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](<2 x s16>)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](<2 x s16>)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5797,7 +5797,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
@@ -5833,8 +5833,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5846,7 +5846,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p0_inreg(ptr inreg %arg)
@@ -5882,8 +5882,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5895,7 +5895,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
@@ -5928,7 +5928,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](p3)
+ ; CHECK-NEXT: $sgpr16 = COPY [[COPY9]](p3)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5940,7 +5940,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
@@ -5980,10 +5980,10 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
- ; CHECK-NEXT: $sgpr2 = COPY [[UV2]](s32)
- ; CHECK-NEXT: $sgpr3 = COPY [[UV3]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr18 = COPY [[UV2]](s32)
+ ; CHECK-NEXT: $sgpr19 = COPY [[UV3]](s32)
; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY13]](p4)
@@ -5995,7 +5995,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; CHECK-NEXT: $sgpr14 = COPY [[COPY19]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY20]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY21]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
@@ -6031,8 +6031,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -6044,7 +6044,7 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a8a6f1954edd1..b8758a72998e2 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26343,31 +26343,31 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_false_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[0:1], 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_false_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_false_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_false_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_false_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_false_bf16:
@@ -26387,7 +26387,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oeq_bf16:
@@ -26397,7 +26397,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_eq_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oeq_bf16:
@@ -26405,7 +26405,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_eq_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oeq_bf16:
@@ -26413,7 +26413,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_eq_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oeq_bf16:
@@ -26421,7 +26421,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_eq_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_eq_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oeq_bf16:
@@ -26444,7 +26444,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ogt_bf16:
@@ -26454,7 +26454,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ogt_bf16:
@@ -26462,7 +26462,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ogt_bf16:
@@ -26470,7 +26470,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ogt_bf16:
@@ -26478,7 +26478,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ogt_bf16:
@@ -26501,7 +26501,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oge_bf16:
@@ -26511,7 +26511,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oge_bf16:
@@ -26519,7 +26519,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oge_bf16:
@@ -26527,7 +26527,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oge_bf16:
@@ -26535,7 +26535,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_ge_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_ge_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oge_bf16:
@@ -26558,7 +26558,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_olt_bf16:
@@ -26568,7 +26568,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_olt_bf16:
@@ -26576,7 +26576,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_olt_bf16:
@@ -26584,7 +26584,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_olt_bf16:
@@ -26592,7 +26592,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_lt_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_lt_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_olt_bf16:
@@ -26615,7 +26615,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_le_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ole_bf16:
@@ -26625,7 +26625,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_le_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ole_bf16:
@@ -26633,7 +26633,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_le_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ole_bf16:
@@ -26641,7 +26641,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_le_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ole_bf16:
@@ -26649,7 +26649,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_le_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_le_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ole_bf16:
@@ -26672,7 +26672,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_lg_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_one_bf16:
@@ -26682,7 +26682,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_lg_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_one_bf16:
@@ -26690,7 +26690,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_lg_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_one_bf16:
@@ -26698,7 +26698,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lg_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_lg_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_one_bf16:
@@ -26706,7 +26706,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_lg_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_lg_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_one_bf16:
@@ -26729,7 +26729,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uno_bf16:
@@ -26739,7 +26739,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uno_bf16:
@@ -26747,7 +26747,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uno_bf16:
@@ -26755,7 +26755,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uno_bf16:
@@ -26763,7 +26763,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_u_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uno_bf16:
@@ -26786,7 +26786,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_nlg_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ueq_bf16:
@@ -26796,7 +26796,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_nlg_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ueq_bf16:
@@ -26804,7 +26804,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_nlg_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ueq_bf16:
@@ -26812,7 +26812,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_nlg_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ueq_bf16:
@@ -26820,7 +26820,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nlg_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_nlg_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ueq_bf16:
@@ -26843,7 +26843,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_nle_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ugt_bf16:
@@ -26853,7 +26853,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_nle_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ugt_bf16:
@@ -26861,7 +26861,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_nle_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ugt_bf16:
@@ -26869,7 +26869,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_nle_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ugt_bf16:
@@ -26877,7 +26877,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nle_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_nle_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ugt_bf16:
@@ -26900,7 +26900,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uge_bf16:
@@ -26910,7 +26910,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_nlt_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uge_bf16:
@@ -26918,7 +26918,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_nlt_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uge_bf16:
@@ -26926,7 +26926,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_nlt_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uge_bf16:
@@ -26934,7 +26934,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nlt_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_nlt_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uge_bf16:
@@ -26957,7 +26957,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_nge_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ult_bf16:
@@ -26967,7 +26967,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_nge_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ult_bf16:
@@ -26975,7 +26975,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_nge_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ult_bf16:
@@ -26983,7 +26983,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_nge_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ult_bf16:
@@ -26991,7 +26991,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_nge_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_nge_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ult_bf16:
@@ -27014,7 +27014,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_ngt_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ule_bf16:
@@ -27024,7 +27024,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_ngt_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ule_bf16:
@@ -27032,7 +27032,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_ngt_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ule_bf16:
@@ -27040,7 +27040,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_ngt_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ule_bf16:
@@ -27048,7 +27048,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_ngt_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_ngt_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ule_bf16:
@@ -27071,7 +27071,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GCN-NEXT: v_cmp_neq_f32_e64 s[4:5], v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_une_bf16:
@@ -27081,7 +27081,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX7-NEXT: v_cmp_neq_f32_e64 s[4:5], v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_une_bf16:
@@ -27089,7 +27089,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX8-NEXT: v_cmp_neq_f32_e64 s[4:5], v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_une_bf16:
@@ -27097,7 +27097,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], v0, v1
+; GFX9-NEXT: v_cmp_neq_f32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_une_bf16:
@@ -27105,7 +27105,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cmp_neq_f32_e64 s0, v0, v1
+; GFX10-NEXT: v_cmp_neq_f32_e64 s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_une_bf16:
@@ -27124,31 +27124,31 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_true_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[0:1], -1
+; GCN-NEXT: s_mov_b64 s[4:5], -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_true_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[0:1], -1
+; GFX7-NEXT: s_mov_b64 s[4:5], -1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_true_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b64 s[0:1], -1
+; GFX8-NEXT: s_mov_b64 s[4:5], -1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_true_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_true_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, -1
+; GFX10-NEXT: s_mov_b32 s4, -1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_true_bf16:
@@ -33476,6 +33476,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33483,6 +33485,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-LABEL: v_select_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33518,7 +33522,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_lhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33526,7 +33531,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-LABEL: v_select_fneg_lhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33568,7 +33574,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_rhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33576,7 +33583,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX7-LABEL: v_select_fneg_rhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33618,6 +33626,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GCN-LABEL: v_select_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -33631,6 +33643,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX7-LABEL: v_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -34062,6 +34078,12 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GCN-LABEL: v_select_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -34078,9 +34100,15 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GFX7-LABEL: v_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
@@ -34126,6 +34154,14 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GCN-LABEL: v_select_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -34145,14 +34181,22 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX7-LABEL: v_select_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -34196,6 +34240,18 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GCN-LABEL: v_select_v6bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -34222,18 +34278,30 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX7-LABEL: v_select_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
@@ -34284,6 +34352,22 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GCN-LABEL: v_select_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -34317,22 +34401,38 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX7-LABEL: v_select_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v10, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v12, 16
-; GFX7-NEXT: v_alignbit_b32 v7, v7, v14, 16
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v15
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v14
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
@@ -34390,44 +34490,77 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v1, v1, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19
-; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21
-; GCN-NEXT: v_alignbit_b32 v5, v5, v20, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_alignbit_b32 v5, v5, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GCN-NEXT: v_alignbit_b32 v7, v7, v22, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25
-; GCN-NEXT: v_alignbit_b32 v9, v9, v24, 16
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v9, v9, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v27
+; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT: v_alignbit_b32 v11, v16, v26, 16
+; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; GCN-NEXT: v_alignbit_b32 v13, v17, v28, 16
+; GCN-NEXT: v_alignbit_b32 v13, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16
; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v12, v16, v10, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v8, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v11
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -34438,13 +34571,12 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v18
-; GCN-NEXT: v_alignbit_b32 v15, v15, v30, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_alignbit_b32 v15, v15, v20, 16
; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v14, s[4:5]
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -34453,39 +34585,69 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-LABEL: v_select_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v19
; GFX7-NEXT: v_alignbit_b32 v1, v1, v16, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v29
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v16, 16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v26, 16
-; GFX7-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v17, v28, 16
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v24
+; GFX7-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v10, s[4:5]
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v18, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v20, 16
-; GFX7-NEXT: v_alignbit_b32 v7, v7, v22, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v9, v24, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_alignbit_b32 v9, v9, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GFX7-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v29
; GFX7-NEXT: v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT: v_alignbit_b32 v11, v11, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GFX7-NEXT: v_alignbit_b32 v13, v13, v17, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v8, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[4:5]
@@ -34501,14 +34663,16 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v30, 16
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v10, v14, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v30
+; GFX7-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v14, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -34572,136 +34736,200 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GCN-LABEL: v_select_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NEXT: v_alignbit_b32 v3, v3, v6, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9
-; GCN-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GCN-NEXT: v_alignbit_b32 v5, v5, v10, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NEXT: v_alignbit_b32 v6, v6, v12, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15
-; GCN-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v17
-; GCN-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21
-; GCN-NEXT: v_alignbit_b32 v10, v10, v20, 16
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GCN-NEXT: v_alignbit_b32 v11, v11, v22, 16
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25
-; GCN-NEXT: v_alignbit_b32 v12, v12, v24, 16
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:16
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v27
-; GCN-NEXT: v_alignbit_b32 v13, v13, v26, 16
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v29
-; GCN-NEXT: v_alignbit_b32 v14, v14, v28, 16
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_alignbit_b32 v14, v14, v19, 16
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v17, v17, v20, 16
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v19, v19, v21, 16
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:52
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v21, v21, v23, 16
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v24
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v23, v23, v25, 16
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v26
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v25, v25, v27, 16
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v28
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v27, v27, v29, 16
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v31
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_alignbit_b32 v29, v29, v32, 16
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_alignbit_b32 v29, v29, v31, 16
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_alignbit_b32 v30, v31, v30, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_alignbit_b32 v31, v31, v33, 16
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
; GCN-NEXT: v_cndmask_b32_e64 v31, v31, v30, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v29, v29, v14, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v28, v28, v13, s[4:5]
@@ -34755,160 +34983,227 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-LABEL: v_select_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v10, 16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v12, 16
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; GFX7-NEXT: v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v16, v17, v16, 16
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v26, v27, v26, 16
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; GFX7-NEXT: v_alignbit_b32 v22, v23, v22, 16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_alignbit_b32 v14, v15, v14, 16
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_alignbit_b32 v18, v19, v18, 16
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v20, v21, v20, 16
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_alignbit_b32 v24, v25, v24, 16
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX7-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX7-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(11)
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v4, s[4:5]
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v15, 16
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v5, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v2, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v6, s[4:5]
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, v7, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v12, v3, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v10, v1, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v0, s[4:5]
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v1, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v0, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v15
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v9, v9, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v4, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_alignbit_b32 v11, v11, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_alignbit_b32 v13, v13, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v12, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_alignbit_b32 v15, v15, v31, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v14, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v17, v17, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v8, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
+; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v16, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v19, v19, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_cndmask_b32_e64 v19, v19, v18, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v21, v21, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_cndmask_b32_e64 v21, v21, v20, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v23, v23, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v22, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v25, v25, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v27, v27, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v26, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v29, v29, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v28, s[4:5]
@@ -34920,6 +35215,7 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7-NEXT: v_alignbit_b32 v30, v31, v30, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v30, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index bcdfb75ab1ef9..0b9b37f85a755 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -193,24 +193,22 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; DAGISEL-ASM-LABEL: recursive_phis:
; DAGISEL-ASM: ; %bb.0: ; %entry
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
-; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; DAGISEL-ASM-NEXT: ; %bb.1: ; %then
-; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; DAGISEL-ASM-NEXT: v_and_b32_e32 v1, 0xffff, v0
; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
-; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
-; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1
+; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v0, 7
; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally
; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7]
; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
-; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, s9
+; DAGISEL-ASM-NEXT: flat_store_dword v[1:2], v0
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
@@ -222,29 +220,27 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; GISEL-ASM-LABEL: recursive_phis:
; GISEL-ASM: ; %bb.0: ; %entry
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-ASM-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GISEL-ASM-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GISEL-ASM-NEXT: ; %bb.1: ; %then
-; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GISEL-ASM-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
-; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
-; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
-; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, s9
+; GISEL-ASM-NEXT: v_mov_b32_e32 v0, 7
; GISEL-ASM-NEXT: .LBB7_3: ; %finally
; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[6:7]
+; GISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GISEL-ASM-NEXT: flat_store_dword v[1:2], v0
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
-; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
; GISEL-ASM-NEXT: ; %bb.4: ; %end
-; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
; GISEL-ASM-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 53448df79ee27..1931058d75a99 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -43,8 +43,8 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
- ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr4_sgpr5
%setcc = icmp slt i16 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
@@ -92,8 +92,8 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], [[COPY]], implicit-def dead $scc
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
- ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr4_sgpr5
%setcc = icmp slt i32 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
@@ -148,8 +148,8 @@ define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY3]], implicit $exec
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], [[COPY]], implicit-def dead $scc
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_1 = COPY [[S_OR_B64_]]
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[COPY2]]
- ; GCN-NEXT: SI_RETURN implicit $sgpr0_sgpr1
+ ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY2]]
+ ; GCN-NEXT: SI_RETURN implicit $sgpr4_sgpr5
%setcc = icmp slt i64 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 02a3066822e51..d6c7d686976f6 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -30,7 +30,7 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
; CHECK-NEXT: buffer_load_ubyte v0, v1, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = load <8 x i1>, ptr %ptr
%ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 7c5f6d5e33efe..67d6600b44483 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -600,10 +600,9 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; VI-LABEL: fmul_pow_select:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; VI-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, v0, v1
+; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT: s_cselect_b32 s4, 1, 2
+; VI-NEXT: v_lshlrev_b32_e64 v0, v0, s4
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -611,10 +610,9 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; GFX10-LABEL: fmul_pow_select:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_cselect_b32 s4, 1, 2
+; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -622,13 +620,12 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; GFX11-LABEL: fmul_pow_select:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, 1, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s0
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl2 = shl nuw i32 2, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
index 85286841cbcac..d9f062863495a 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
@@ -666,18 +666,14 @@ define float @fold_f32_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, float
; SDAG-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%mul = select i1 %cond, float %sub, float %v1
@@ -688,19 +684,15 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_daz(i1 %cond, floa
; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%mul = select i1 %cond, float %sub, float %v1
@@ -711,19 +703,15 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%mul = select i1 %cond, float %sub, float %v1
@@ -734,19 +722,15 @@ define half @fold_f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, half %v
; SDAG-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub half -0.0, %v0
%mul = select i1 %cond, half %sub, half %v1
@@ -757,19 +741,15 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_daz(i1 %cond, half
; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_sub_f16_e32 v1, 0x8000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub half -0.0, %v0
%mul = select i1 %cond, half %sub, half %v1
@@ -780,19 +760,15 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, h
; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_sub_f16_e32 v1, 0x8000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub half -0.0, %v0
%mul = select i1 %cond, half %sub, half %v1
@@ -803,21 +779,17 @@ define double @fold_f64_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, doubl
; SDAG-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub double -0.0, %v0
%mul = select i1 %cond, double %sub, double %v1
@@ -828,21 +800,17 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_daz(i1 %cond, dou
; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub double -0.0, %v0
%mul = select i1 %cond, double %sub, double %v1
@@ -853,21 +821,17 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f64 v[1:2], -v[1:2], -v[1:2]
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub double -0.0, %v0
%mul = select i1 %cond, double %sub, double %v1
@@ -878,19 +842,15 @@ define <2 x half> @fold_v2f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond,
; SDAG-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
%mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -901,19 +861,15 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz(i1 %con
; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
%mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -924,19 +880,15 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic(i1
; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
%mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -984,7 +936,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee:
@@ -992,7 +943,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1005,7 +955,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz:
@@ -1013,7 +962,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1026,7 +974,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic:
@@ -1034,7 +981,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1047,7 +993,6 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0)
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v1, 0x90
; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee:
@@ -1055,8 +1000,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0)
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1069,8 +1013,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; SDAG-NEXT: v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz:
@@ -1078,8 +1021,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1092,8 +1034,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; SDAG-NEXT: v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic:
@@ -1101,8 +1042,7 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
; GISEL-NEXT: v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1114,15 +1054,13 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_ieee(float %v0, i32 %te
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_ieee:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1134,16 +1072,14 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_daz(float %v0, i32 %tes
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1155,16 +1091,14 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic(float %v0, i32
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f32_e64 v0, -v0, -v0
-; GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub float -0.0, %v0
%class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1176,15 +1110,13 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_var_daz(double %v0, i32 %te
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_var_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub double -0.0, %v0
%class = call i1 @llvm.amdgcn.class.f64(double %sub, i32 %testmask)
@@ -1196,16 +1128,14 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_var_daz(half %v0, i32 %test
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_var_daz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GISEL-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub half -0.0, %v0
%class = call i1 @llvm.amdgcn.class.f16(half %sub, i32 %testmask)
@@ -1218,7 +1148,6 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v2, 0x90
; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_daz:
@@ -1226,8 +1155,7 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v2, 0x90
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub double -0.0, %v0
%class = call i1 @llvm.is.fpclass.f64(double %sub, i32 144)
@@ -1240,8 +1168,7 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
; SDAG-NEXT: v_mov_b32_e32 v1, 0x90
-; SDAG-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_daz:
@@ -1249,8 +1176,7 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_max_f16_e64 v0, -v0, -v0
; GISEL-NEXT: v_mov_b32_e32 v1, 0x90
-; GISEL-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%sub = fsub half -0.0, %v0
%class = call i1 @llvm.is.fpclass.f16(half %sub, i32 144)
diff --git a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
index 0b3366f71d89c..c91f8cd889c88 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-i1-return.ll
@@ -17,7 +17,7 @@ define i1 @i1_func_void() {
; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
@@ -57,7 +57,7 @@ define void @test_call_i1_func_void() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
@@ -118,7 +118,7 @@ define zeroext i1 @zeroext_i1_func_void() {
; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zeroext_i1_func_void:
@@ -158,7 +158,7 @@ define void @test_call_zeroext_i1_func_void() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
@@ -219,7 +219,7 @@ define signext i1 @signext_i1_func_void() {
; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: signext_i1_func_void:
@@ -259,7 +259,7 @@ define void @test_call_signext_i1_func_void() {
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
@@ -419,8 +419,8 @@ define [2 x i1] @a2i1_func_void() {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: a2i1_func_void:
@@ -451,7 +451,7 @@ define void @test_call_a2i1_func_void() {
; GFX9-LABEL: test_call_a2i1_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s8, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -465,10 +465,10 @@ define void @test_call_a2i1_func_void() {
; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
@@ -477,7 +477,7 @@ define void @test_call_a2i1_func_void() {
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_mov_b32 s33, s8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index fb5b4a704b8a1..9d6e78aca8692 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -13,7 +13,7 @@ define i1 @i1_func_void() #0 {
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_func_void:
@@ -41,7 +41,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_zeroext_func_void:
@@ -68,7 +68,7 @@ define signext i1 @i1_signext_func_void() #0 {
; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; GFX789-NEXT: s_waitcnt vmcnt(0)
; GFX789-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX789-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX789-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_signext_func_void:
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b5180dfe9b..9fedb39dad045 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -5,375 +5,334 @@ define void @main(i1 %arg) #0 {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v8, s30, 0
-; CHECK-NEXT: v_writelane_b32 v8, s31, 1
-; CHECK-NEXT: v_writelane_b32 v8, s36, 2
-; CHECK-NEXT: v_writelane_b32 v8, s37, 3
-; CHECK-NEXT: v_writelane_b32 v8, s38, 4
-; CHECK-NEXT: v_writelane_b32 v8, s39, 5
-; CHECK-NEXT: v_writelane_b32 v8, s40, 6
-; CHECK-NEXT: v_writelane_b32 v8, s41, 7
-; CHECK-NEXT: v_writelane_b32 v8, s42, 8
-; CHECK-NEXT: v_writelane_b32 v8, s43, 9
-; CHECK-NEXT: v_writelane_b32 v8, s44, 10
-; CHECK-NEXT: v_writelane_b32 v8, s45, 11
-; CHECK-NEXT: v_writelane_b32 v8, s46, 12
-; CHECK-NEXT: v_writelane_b32 v8, s47, 13
-; CHECK-NEXT: v_writelane_b32 v8, s48, 14
-; CHECK-NEXT: v_writelane_b32 v8, s49, 15
-; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v8, s50, 16
-; CHECK-NEXT: s_movk_i32 s4, 0xf0
-; CHECK-NEXT: s_mov_b32 s5, s24
-; CHECK-NEXT: v_writelane_b32 v8, s51, 17
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT: s_movk_i32 s4, 0x130
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: v_writelane_b32 v6, s30, 0
+; CHECK-NEXT: v_writelane_b32 v6, s31, 1
+; CHECK-NEXT: v_writelane_b32 v6, s36, 2
+; CHECK-NEXT: v_writelane_b32 v6, s37, 3
+; CHECK-NEXT: v_writelane_b32 v6, s38, 4
+; CHECK-NEXT: v_writelane_b32 v6, s39, 5
+; CHECK-NEXT: v_writelane_b32 v6, s40, 6
+; CHECK-NEXT: v_writelane_b32 v6, s41, 7
+; CHECK-NEXT: v_writelane_b32 v6, s42, 8
+; CHECK-NEXT: v_writelane_b32 v6, s43, 9
+; CHECK-NEXT: v_writelane_b32 v6, s44, 10
+; CHECK-NEXT: v_writelane_b32 v6, s45, 11
+; CHECK-NEXT: v_writelane_b32 v6, s46, 12
+; CHECK-NEXT: v_writelane_b32 v6, s47, 13
+; CHECK-NEXT: v_writelane_b32 v6, s48, 14
+; CHECK-NEXT: v_writelane_b32 v6, s49, 15
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: v_writelane_b32 v6, s50, 16
+; CHECK-NEXT: s_movk_i32 s8, 0xf0
+; CHECK-NEXT: s_mov_b32 s9, s6
+; CHECK-NEXT: v_writelane_b32 v6, s51, 17
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[8:9], 0x0
+; CHECK-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: s_load_dwordx4 s[28:31], s[8:9], 0x0
+; CHECK-NEXT: s_movk_i32 s8, 0x130
+; CHECK-NEXT: s_mov_b32 s9, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v4, s36, 0
-; CHECK-NEXT: v_writelane_b32 v4, s37, 1
-; CHECK-NEXT: v_writelane_b32 v4, s38, 2
-; CHECK-NEXT: v_writelane_b32 v4, s39, 3
-; CHECK-NEXT: v_writelane_b32 v4, s40, 4
-; CHECK-NEXT: v_writelane_b32 v4, s41, 5
-; CHECK-NEXT: v_writelane_b32 v4, s42, 6
-; CHECK-NEXT: v_writelane_b32 v4, s43, 7
-; CHECK-NEXT: v_writelane_b32 v4, s44, 8
-; CHECK-NEXT: v_writelane_b32 v4, s45, 9
-; CHECK-NEXT: v_writelane_b32 v4, s46, 10
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v4, s47, 11
-; CHECK-NEXT: v_writelane_b32 v4, s48, 12
-; CHECK-NEXT: v_writelane_b32 v4, s49, 13
-; CHECK-NEXT: s_mov_b32 s20, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_writelane_b32 v4, s50, 14
-; CHECK-NEXT: v_mov_b32_e32 v5, s28
-; CHECK-NEXT: v_mov_b32_e32 v6, v1
-; CHECK-NEXT: s_mov_b32 s21, s20
-; CHECK-NEXT: s_mov_b32 s22, s20
-; CHECK-NEXT: s_mov_b32 s23, s20
-; CHECK-NEXT: v_writelane_b32 v4, s51, 15
-; CHECK-NEXT: v_mov_b32_e32 v2, v1
-; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: v_writelane_b32 v2, s36, 0
+; CHECK-NEXT: v_writelane_b32 v2, s37, 1
+; CHECK-NEXT: v_writelane_b32 v2, s38, 2
+; CHECK-NEXT: v_writelane_b32 v2, s39, 3
+; CHECK-NEXT: v_writelane_b32 v2, s40, 4
+; CHECK-NEXT: v_writelane_b32 v2, s41, 5
+; CHECK-NEXT: v_writelane_b32 v2, s42, 6
+; CHECK-NEXT: v_writelane_b32 v2, s43, 7
+; CHECK-NEXT: v_writelane_b32 v2, s44, 8
+; CHECK-NEXT: v_writelane_b32 v2, s45, 9
+; CHECK-NEXT: v_writelane_b32 v2, s46, 10
+; CHECK-NEXT: s_load_dwordx16 s[8:23], s[8:9], 0x0
+; CHECK-NEXT: v_writelane_b32 v2, s47, 11
+; CHECK-NEXT: v_writelane_b32 v2, s48, 12
+; CHECK-NEXT: v_writelane_b32 v2, s49, 13
+; CHECK-NEXT: s_mov_b32 s24, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_writelane_b32 v2, s50, 14
+; CHECK-NEXT: v_mov_b32_e32 v3, s28
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
+; CHECK-NEXT: s_mov_b32 s25, s24
+; CHECK-NEXT: s_mov_b32 s26, s24
+; CHECK-NEXT: s_mov_b32 s27, s24
+; CHECK-NEXT: v_writelane_b32 v2, s51, 15
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: image_sample_lz v3, v[3:4], s[44:51], s[24:27] dmask:0x1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v4, s4, 16
-; CHECK-NEXT: v_writelane_b32 v4, s5, 17
-; CHECK-NEXT: v_writelane_b32 v4, s6, 18
-; CHECK-NEXT: v_writelane_b32 v4, s7, 19
-; CHECK-NEXT: v_writelane_b32 v4, s8, 20
-; CHECK-NEXT: v_writelane_b32 v4, s9, 21
-; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT: v_writelane_b32 v4, s10, 22
-; CHECK-NEXT: v_writelane_b32 v4, s11, 23
-; CHECK-NEXT: v_writelane_b32 v4, s12, 24
-; CHECK-NEXT: v_writelane_b32 v4, s13, 25
-; CHECK-NEXT: v_writelane_b32 v4, s14, 26
-; CHECK-NEXT: v_writelane_b32 v4, s15, 27
-; CHECK-NEXT: v_writelane_b32 v4, s16, 28
-; CHECK-NEXT: v_writelane_b32 v8, s52, 18
-; CHECK-NEXT: v_writelane_b32 v4, s17, 29
-; CHECK-NEXT: v_writelane_b32 v8, s53, 19
-; CHECK-NEXT: v_writelane_b32 v4, s18, 30
-; CHECK-NEXT: v_writelane_b32 v8, s54, 20
-; CHECK-NEXT: v_writelane_b32 v4, s19, 31
-; CHECK-NEXT: s_mov_b32 s4, 48
-; CHECK-NEXT: s_mov_b32 s5, s24
-; CHECK-NEXT: v_writelane_b32 v8, s55, 21
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v8, s56, 22
-; CHECK-NEXT: v_writelane_b32 v8, s57, 23
-; CHECK-NEXT: v_writelane_b32 v8, s58, 24
-; CHECK-NEXT: v_writelane_b32 v8, s59, 25
-; CHECK-NEXT: v_writelane_b32 v8, s60, 26
+; CHECK-NEXT: v_writelane_b32 v2, s8, 16
+; CHECK-NEXT: v_writelane_b32 v2, s9, 17
+; CHECK-NEXT: v_writelane_b32 v2, s10, 18
+; CHECK-NEXT: v_writelane_b32 v2, s11, 19
+; CHECK-NEXT: v_writelane_b32 v2, s12, 20
+; CHECK-NEXT: v_writelane_b32 v2, s13, 21
+; CHECK-NEXT: image_sample_lz v4, v[0:1], s[8:15], s[24:27] dmask:0x1
+; CHECK-NEXT: v_writelane_b32 v2, s14, 22
+; CHECK-NEXT: v_writelane_b32 v2, s15, 23
+; CHECK-NEXT: v_writelane_b32 v2, s16, 24
+; CHECK-NEXT: v_writelane_b32 v2, s17, 25
+; CHECK-NEXT: v_writelane_b32 v2, s18, 26
+; CHECK-NEXT: v_writelane_b32 v2, s19, 27
+; CHECK-NEXT: v_writelane_b32 v2, s20, 28
+; CHECK-NEXT: v_writelane_b32 v2, s21, 29
+; CHECK-NEXT: v_writelane_b32 v2, s22, 30
+; CHECK-NEXT: v_writelane_b32 v2, s23, 31
+; CHECK-NEXT: s_mov_b32 s8, 48
+; CHECK-NEXT: s_mov_b32 s9, s6
+; CHECK-NEXT: s_movk_i32 s12, 0x1f0
+; CHECK-NEXT: s_xor_b64 s[14:15], s[4:5], -1
+; CHECK-NEXT: s_mov_b32 s13, s6
+; CHECK-NEXT: s_mov_b32 s29, s6
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[12:13], 0x0
+; CHECK-NEXT: v_writelane_b32 v6, s52, 18
+; CHECK-NEXT: v_writelane_b32 v6, s53, 19
+; CHECK-NEXT: v_writelane_b32 v6, s54, 20
+; CHECK-NEXT: v_writelane_b32 v6, s55, 21
+; CHECK-NEXT: v_writelane_b32 v6, s56, 22
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v4, s4, 32
-; CHECK-NEXT: v_writelane_b32 v8, s61, 27
-; CHECK-NEXT: v_writelane_b32 v4, s5, 33
-; CHECK-NEXT: v_writelane_b32 v8, s62, 28
-; CHECK-NEXT: v_writelane_b32 v4, s6, 34
-; CHECK-NEXT: v_writelane_b32 v8, s63, 29
-; CHECK-NEXT: v_writelane_b32 v4, s7, 35
-; CHECK-NEXT: v_writelane_b32 v8, s64, 30
-; CHECK-NEXT: v_writelane_b32 v4, s8, 36
-; CHECK-NEXT: v_writelane_b32 v8, s65, 31
-; CHECK-NEXT: v_writelane_b32 v4, s9, 37
-; CHECK-NEXT: v_writelane_b32 v8, s66, 32
-; CHECK-NEXT: s_movk_i32 s26, 0x1f0
+; CHECK-NEXT: v_writelane_b32 v2, s36, 32
+; CHECK-NEXT: v_writelane_b32 v6, s57, 23
+; CHECK-NEXT: v_writelane_b32 v2, s37, 33
+; CHECK-NEXT: v_writelane_b32 v6, s58, 24
+; CHECK-NEXT: v_writelane_b32 v2, s38, 34
+; CHECK-NEXT: v_writelane_b32 v6, s59, 25
+; CHECK-NEXT: v_writelane_b32 v2, s39, 35
+; CHECK-NEXT: v_writelane_b32 v6, s60, 26
+; CHECK-NEXT: v_writelane_b32 v2, s40, 36
+; CHECK-NEXT: v_writelane_b32 v6, s61, 27
+; CHECK-NEXT: v_writelane_b32 v2, s41, 37
+; CHECK-NEXT: v_writelane_b32 v6, s62, 28
+; CHECK-NEXT: v_writelane_b32 v2, s42, 38
+; CHECK-NEXT: v_writelane_b32 v6, s63, 29
+; CHECK-NEXT: v_writelane_b32 v2, s43, 39
+; CHECK-NEXT: v_writelane_b32 v6, s64, 30
+; CHECK-NEXT: v_writelane_b32 v2, s44, 40
+; CHECK-NEXT: v_writelane_b32 v6, s65, 31
+; CHECK-NEXT: v_writelane_b32 v2, s45, 41
+; CHECK-NEXT: v_writelane_b32 v6, s66, 32
; CHECK-NEXT: s_movk_i32 s28, 0x2f0
-; CHECK-NEXT: s_mov_b32 s27, s24
-; CHECK-NEXT: s_mov_b32 s29, s24
-; CHECK-NEXT: v_writelane_b32 v4, s10, 38
-; CHECK-NEXT: v_writelane_b32 v8, s67, 33
-; CHECK-NEXT: v_writelane_b32 v4, s11, 39
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
-; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v2, s46, 42
+; CHECK-NEXT: v_writelane_b32 v6, s67, 33
+; CHECK-NEXT: v_writelane_b32 v2, s47, 43
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT: v_writelane_b32 v2, s48, 44
+; CHECK-NEXT: v_writelane_b32 v2, s49, 45
+; CHECK-NEXT: v_writelane_b32 v2, s50, 46
+; CHECK-NEXT: v_writelane_b32 v2, s51, 47
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5
-; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25]
-; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27]
+; CHECK-NEXT: v_mul_f32_e32 v3, v4, v3
+; CHECK-NEXT: s_and_saveexec_b64 s[12:13], s[14:15]
+; CHECK-NEXT: s_xor_b64 s[12:13], exec, s[12:13]
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: v_readlane_b32 s36, v4, 0
-; CHECK-NEXT: v_readlane_b32 s44, v4, 8
-; CHECK-NEXT: v_readlane_b32 s45, v4, 9
-; CHECK-NEXT: v_readlane_b32 s46, v4, 10
-; CHECK-NEXT: v_readlane_b32 s47, v4, 11
-; CHECK-NEXT: v_readlane_b32 s48, v4, 12
-; CHECK-NEXT: v_readlane_b32 s49, v4, 13
-; CHECK-NEXT: v_readlane_b32 s50, v4, 14
-; CHECK-NEXT: v_readlane_b32 s51, v4, 15
+; CHECK-NEXT: v_readlane_b32 s36, v2, 0
+; CHECK-NEXT: v_readlane_b32 s44, v2, 8
+; CHECK-NEXT: v_readlane_b32 s45, v2, 9
+; CHECK-NEXT: v_readlane_b32 s46, v2, 10
+; CHECK-NEXT: v_readlane_b32 s47, v2, 11
+; CHECK-NEXT: v_readlane_b32 s48, v2, 12
+; CHECK-NEXT: v_readlane_b32 s49, v2, 13
+; CHECK-NEXT: v_readlane_b32 s50, v2, 14
+; CHECK-NEXT: v_readlane_b32 s51, v2, 15
; CHECK-NEXT: s_and_b64 vcc, exec, -1
-; CHECK-NEXT: v_readlane_b32 s37, v4, 1
-; CHECK-NEXT: v_readlane_b32 s38, v4, 2
-; CHECK-NEXT: v_readlane_b32 s39, v4, 3
-; CHECK-NEXT: v_readlane_b32 s40, v4, 4
-; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: v_readlane_b32 s41, v4, 5
-; CHECK-NEXT: v_readlane_b32 s42, v4, 6
-; CHECK-NEXT: v_readlane_b32 s43, v4, 7
+; CHECK-NEXT: v_readlane_b32 s37, v2, 1
+; CHECK-NEXT: v_readlane_b32 s38, v2, 2
+; CHECK-NEXT: v_readlane_b32 s39, v2, 3
+; CHECK-NEXT: v_readlane_b32 s40, v2, 4
+; CHECK-NEXT: image_sample_lz v4, v[0:1], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_readlane_b32 s41, v2, 5
+; CHECK-NEXT: v_readlane_b32 s42, v2, 6
+; CHECK-NEXT: v_readlane_b32 s43, v2, 7
; CHECK-NEXT: .LBB0_2: ; %bb50
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_readlane_b32 s36, v4, 32
-; CHECK-NEXT: v_readlane_b32 s40, v4, 36
-; CHECK-NEXT: v_readlane_b32 s41, v4, 37
-; CHECK-NEXT: v_readlane_b32 s42, v4, 38
-; CHECK-NEXT: v_readlane_b32 s43, v4, 39
-; CHECK-NEXT: s_mov_b32 s21, s20
-; CHECK-NEXT: s_mov_b32 s22, s20
-; CHECK-NEXT: s_mov_b32 s23, s20
-; CHECK-NEXT: v_readlane_b32 s37, v4, 33
-; CHECK-NEXT: v_readlane_b32 s38, v4, 34
+; CHECK-NEXT: v_readlane_b32 s36, v2, 32
+; CHECK-NEXT: v_readlane_b32 s44, v2, 40
+; CHECK-NEXT: v_readlane_b32 s45, v2, 41
+; CHECK-NEXT: v_readlane_b32 s46, v2, 42
+; CHECK-NEXT: v_readlane_b32 s47, v2, 43
+; CHECK-NEXT: v_readlane_b32 s48, v2, 44
+; CHECK-NEXT: v_readlane_b32 s49, v2, 45
+; CHECK-NEXT: v_readlane_b32 s50, v2, 46
+; CHECK-NEXT: v_readlane_b32 s51, v2, 47
+; CHECK-NEXT: s_mov_b32 s25, s24
+; CHECK-NEXT: s_mov_b32 s26, s24
+; CHECK-NEXT: s_mov_b32 s27, s24
+; CHECK-NEXT: v_readlane_b32 s37, v2, 33
+; CHECK-NEXT: v_readlane_b32 s38, v2, 34
+; CHECK-NEXT: image_sample_lz v5, v[0:1], s[44:51], s[8:11] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s39, v2, 35
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s39, v4, 35
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT: image_sample_lz v0, v[0:1], s[60:67], s[24:27] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s40, v2, 36
+; CHECK-NEXT: v_readlane_b32 s41, v2, 37
+; CHECK-NEXT: v_readlane_b32 s42, v2, 38
+; CHECK-NEXT: v_readlane_b32 s43, v2, 39
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6
-; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0
-; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5
+; CHECK-NEXT: v_sub_f32_e32 v0, v0, v5
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v4
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: .LBB0_3: ; %Flow14
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s12, v4, 32
-; CHECK-NEXT: v_readlane_b32 s13, v4, 33
-; CHECK-NEXT: v_readlane_b32 s14, v4, 34
-; CHECK-NEXT: v_readlane_b32 s15, v4, 35
-; CHECK-NEXT: v_readlane_b32 s16, v4, 36
-; CHECK-NEXT: v_readlane_b32 s17, v4, 37
-; CHECK-NEXT: v_readlane_b32 s18, v4, 38
-; CHECK-NEXT: v_readlane_b32 s19, v4, 39
-; CHECK-NEXT: v_writelane_b32 v4, s4, 40
-; CHECK-NEXT: v_writelane_b32 v4, s5, 41
-; CHECK-NEXT: v_writelane_b32 v4, s6, 42
-; CHECK-NEXT: v_writelane_b32 v4, s7, 43
-; CHECK-NEXT: v_writelane_b32 v4, s8, 44
-; CHECK-NEXT: v_writelane_b32 v4, s9, 45
-; CHECK-NEXT: v_writelane_b32 v4, s10, 46
-; CHECK-NEXT: v_writelane_b32 v4, s11, 47
-; CHECK-NEXT: v_writelane_b32 v4, s12, 48
-; CHECK-NEXT: v_writelane_b32 v4, s13, 49
-; CHECK-NEXT: v_writelane_b32 v4, s14, 50
-; CHECK-NEXT: v_writelane_b32 v4, s15, 51
-; CHECK-NEXT: v_writelane_b32 v4, s16, 52
-; CHECK-NEXT: v_writelane_b32 v4, s17, 53
-; CHECK-NEXT: v_writelane_b32 v4, s18, 54
-; CHECK-NEXT: v_writelane_b32 v4, s19, 55
-; CHECK-NEXT: v_writelane_b32 v4, s52, 56
-; CHECK-NEXT: v_writelane_b32 v3, s60, 0
-; CHECK-NEXT: v_writelane_b32 v4, s53, 57
-; CHECK-NEXT: v_writelane_b32 v3, s61, 1
-; CHECK-NEXT: v_writelane_b32 v4, s54, 58
-; CHECK-NEXT: v_writelane_b32 v3, s62, 2
-; CHECK-NEXT: v_writelane_b32 v4, s55, 59
-; CHECK-NEXT: v_writelane_b32 v3, s63, 3
-; CHECK-NEXT: v_writelane_b32 v4, s56, 60
-; CHECK-NEXT: v_writelane_b32 v3, s64, 4
-; CHECK-NEXT: v_writelane_b32 v4, s57, 61
-; CHECK-NEXT: v_writelane_b32 v3, s65, 5
-; CHECK-NEXT: v_writelane_b32 v4, s58, 62
-; CHECK-NEXT: v_writelane_b32 v3, s66, 6
-; CHECK-NEXT: v_writelane_b32 v4, s59, 63
-; CHECK-NEXT: v_writelane_b32 v3, s67, 7
-; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT: v_readlane_b32 s16, v2, 32
+; CHECK-NEXT: v_readlane_b32 s17, v2, 33
+; CHECK-NEXT: v_readlane_b32 s18, v2, 34
+; CHECK-NEXT: v_readlane_b32 s19, v2, 35
+; CHECK-NEXT: v_readlane_b32 s20, v2, 36
+; CHECK-NEXT: v_readlane_b32 s21, v2, 37
+; CHECK-NEXT: v_readlane_b32 s22, v2, 38
+; CHECK-NEXT: v_readlane_b32 s23, v2, 39
+; CHECK-NEXT: v_readlane_b32 s24, v2, 40
+; CHECK-NEXT: v_readlane_b32 s25, v2, 41
+; CHECK-NEXT: v_readlane_b32 s26, v2, 42
+; CHECK-NEXT: v_readlane_b32 s27, v2, 43
+; CHECK-NEXT: v_readlane_b32 s28, v2, 44
+; CHECK-NEXT: v_readlane_b32 s29, v2, 45
+; CHECK-NEXT: v_readlane_b32 s30, v2, 46
+; CHECK-NEXT: v_readlane_b32 s31, v2, 47
+; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.4: ; %bb32
-; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9]
+; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[14:15]
+; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[8:9]
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb43
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: v_mov_b32_e32 v0, s8
-; CHECK-NEXT: v_readlane_b32 s36, v4, 0
+; CHECK-NEXT: v_readlane_b32 s36, v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_readlane_b32 s37, v4, 1
-; CHECK-NEXT: v_readlane_b32 s38, v4, 2
-; CHECK-NEXT: v_readlane_b32 s39, v4, 3
-; CHECK-NEXT: v_readlane_b32 s40, v4, 4
-; CHECK-NEXT: v_readlane_b32 s41, v4, 5
-; CHECK-NEXT: v_readlane_b32 s42, v4, 6
-; CHECK-NEXT: v_readlane_b32 s43, v4, 7
-; CHECK-NEXT: v_readlane_b32 s44, v4, 8
-; CHECK-NEXT: v_readlane_b32 s45, v4, 9
-; CHECK-NEXT: v_readlane_b32 s46, v4, 10
-; CHECK-NEXT: v_readlane_b32 s47, v4, 11
-; CHECK-NEXT: v_readlane_b32 s48, v4, 12
-; CHECK-NEXT: v_readlane_b32 s49, v4, 13
-; CHECK-NEXT: v_readlane_b32 s50, v4, 14
-; CHECK-NEXT: v_readlane_b32 s51, v4, 15
-; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s36, v4, 16
-; CHECK-NEXT: v_readlane_b32 s44, v4, 24
-; CHECK-NEXT: v_readlane_b32 s45, v4, 25
-; CHECK-NEXT: v_readlane_b32 s46, v4, 26
-; CHECK-NEXT: v_readlane_b32 s47, v4, 27
-; CHECK-NEXT: v_readlane_b32 s48, v4, 28
-; CHECK-NEXT: v_readlane_b32 s49, v4, 29
-; CHECK-NEXT: v_readlane_b32 s50, v4, 30
-; CHECK-NEXT: v_readlane_b32 s51, v4, 31
-; CHECK-NEXT: v_mov_b32_e32 v6, 0
-; CHECK-NEXT: v_mov_b32_e32 v7, v6
-; CHECK-NEXT: v_readlane_b32 s37, v4, 17
-; CHECK-NEXT: v_readlane_b32 s38, v4, 18
-; CHECK-NEXT: v_readlane_b32 s39, v4, 19
-; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s40, v4, 20
-; CHECK-NEXT: v_readlane_b32 s41, v4, 21
-; CHECK-NEXT: v_readlane_b32 s42, v4, 22
-; CHECK-NEXT: v_readlane_b32 s43, v4, 23
+; CHECK-NEXT: v_readlane_b32 s37, v2, 1
+; CHECK-NEXT: v_readlane_b32 s38, v2, 2
+; CHECK-NEXT: v_readlane_b32 s39, v2, 3
+; CHECK-NEXT: v_readlane_b32 s40, v2, 4
+; CHECK-NEXT: v_readlane_b32 s41, v2, 5
+; CHECK-NEXT: v_readlane_b32 s42, v2, 6
+; CHECK-NEXT: v_readlane_b32 s43, v2, 7
+; CHECK-NEXT: v_readlane_b32 s44, v2, 8
+; CHECK-NEXT: v_readlane_b32 s45, v2, 9
+; CHECK-NEXT: v_readlane_b32 s46, v2, 10
+; CHECK-NEXT: v_readlane_b32 s47, v2, 11
+; CHECK-NEXT: v_readlane_b32 s48, v2, 12
+; CHECK-NEXT: v_readlane_b32 s49, v2, 13
+; CHECK-NEXT: v_readlane_b32 s50, v2, 14
+; CHECK-NEXT: v_readlane_b32 s51, v2, 15
+; CHECK-NEXT: image_sample_lz v3, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s36, v2, 16
+; CHECK-NEXT: v_readlane_b32 s44, v2, 24
+; CHECK-NEXT: v_readlane_b32 s45, v2, 25
+; CHECK-NEXT: v_readlane_b32 s46, v2, 26
+; CHECK-NEXT: v_readlane_b32 s47, v2, 27
+; CHECK-NEXT: v_readlane_b32 s48, v2, 28
+; CHECK-NEXT: v_readlane_b32 s49, v2, 29
+; CHECK-NEXT: v_readlane_b32 s50, v2, 30
+; CHECK-NEXT: v_readlane_b32 s51, v2, 31
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v5, v4
+; CHECK-NEXT: v_readlane_b32 s37, v2, 17
+; CHECK-NEXT: v_readlane_b32 s38, v2, 18
+; CHECK-NEXT: v_readlane_b32 s39, v2, 19
+; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[4:7] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s40, v2, 20
+; CHECK-NEXT: v_readlane_b32 s41, v2, 21
+; CHECK-NEXT: v_readlane_b32 s42, v2, 22
+; CHECK-NEXT: v_readlane_b32 s43, v2, 23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0
+; CHECK-NEXT: buffer_store_dwordx3 v[3:5], off, s[8:11], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: .LBB0_6: ; %Flow12
-; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT: v_readlane_b32 s52, v4, 40
-; CHECK-NEXT: v_readlane_b32 s53, v4, 41
-; CHECK-NEXT: v_readlane_b32 s54, v4, 42
-; CHECK-NEXT: v_readlane_b32 s55, v4, 43
-; CHECK-NEXT: v_readlane_b32 s56, v4, 44
-; CHECK-NEXT: v_readlane_b32 s57, v4, 45
-; CHECK-NEXT: v_readlane_b32 s58, v4, 46
-; CHECK-NEXT: v_readlane_b32 s59, v4, 47
-; CHECK-NEXT: v_readlane_b32 s60, v4, 48
-; CHECK-NEXT: v_readlane_b32 s61, v4, 49
-; CHECK-NEXT: v_readlane_b32 s62, v4, 50
-; CHECK-NEXT: v_readlane_b32 s63, v4, 51
-; CHECK-NEXT: v_readlane_b32 s64, v4, 52
-; CHECK-NEXT: v_readlane_b32 s65, v4, 53
-; CHECK-NEXT: v_readlane_b32 s66, v4, 54
-; CHECK-NEXT: v_readlane_b32 s67, v4, 55
+; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[14:15]
+; CHECK-NEXT: s_mov_b64 s[42:43], s[22:23]
+; CHECK-NEXT: s_mov_b64 s[40:41], s[20:21]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[18:19]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[16:17]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s6, s8
; CHECK-NEXT: s_mov_b32 s7, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: v_readlane_b32 s36, v4, 56
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_mov_b32_e32 v2, s7
-; CHECK-NEXT: v_readlane_b32 s37, v4, 57
-; CHECK-NEXT: v_readlane_b32 s38, v4, 58
-; CHECK-NEXT: v_readlane_b32 s39, v4, 59
-; CHECK-NEXT: v_readlane_b32 s40, v4, 60
-; CHECK-NEXT: v_readlane_b32 s41, v4, 61
-; CHECK-NEXT: v_readlane_b32 s42, v4, 62
-; CHECK-NEXT: v_readlane_b32 s43, v4, 63
-; CHECK-NEXT: s_nop 4
-; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
+; CHECK-NEXT: image_sample_lz v4, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: image_sample_lz v5, v[0:1], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT: ; kill: killed $vgpr0_vgpr1
+; CHECK-NEXT: s_mov_b64 s[16:17], s[52:53]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_and_b64 vcc, exec, 0
-; CHECK-NEXT: v_readlane_b32 s44, v3, 0
-; CHECK-NEXT: v_readlane_b32 s45, v3, 1
-; CHECK-NEXT: v_readlane_b32 s46, v3, 2
-; CHECK-NEXT: v_readlane_b32 s47, v3, 3
-; CHECK-NEXT: v_readlane_b32 s48, v3, 4
-; CHECK-NEXT: v_readlane_b32 s49, v3, 5
-; CHECK-NEXT: v_readlane_b32 s50, v3, 6
-; CHECK-NEXT: v_readlane_b32 s51, v3, 7
-; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+; CHECK-NEXT: s_mov_b64 s[18:19], s[54:55]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[56:57]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[58:59]
+; CHECK-NEXT: ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
+; CHECK-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5
-; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_f32_e32 v0, v5, v4
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3
; CHECK-NEXT: .LBB0_8: ; %bb33
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_f32_e32 v2, v1, v0
-; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2
+; CHECK-NEXT: v_add_f32_e32 v3, v1, v0
+; CHECK-NEXT: v_sub_f32_e32 v1, v1, v3
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccz .LBB0_8
; CHECK-NEXT: .LBB0_9: ; %Flow13
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT: v_readlane_b32 s67, v8, 33
-; CHECK-NEXT: v_readlane_b32 s66, v8, 32
-; CHECK-NEXT: v_readlane_b32 s65, v8, 31
-; CHECK-NEXT: v_readlane_b32 s64, v8, 30
-; CHECK-NEXT: v_readlane_b32 s63, v8, 29
-; CHECK-NEXT: v_readlane_b32 s62, v8, 28
-; CHECK-NEXT: v_readlane_b32 s61, v8, 27
-; CHECK-NEXT: v_readlane_b32 s60, v8, 26
-; CHECK-NEXT: v_readlane_b32 s59, v8, 25
-; CHECK-NEXT: v_readlane_b32 s58, v8, 24
-; CHECK-NEXT: v_readlane_b32 s57, v8, 23
-; CHECK-NEXT: v_readlane_b32 s56, v8, 22
-; CHECK-NEXT: v_readlane_b32 s55, v8, 21
-; CHECK-NEXT: v_readlane_b32 s54, v8, 20
-; CHECK-NEXT: v_readlane_b32 s53, v8, 19
-; CHECK-NEXT: v_readlane_b32 s52, v8, 18
-; CHECK-NEXT: v_readlane_b32 s51, v8, 17
-; CHECK-NEXT: v_readlane_b32 s50, v8, 16
-; CHECK-NEXT: v_readlane_b32 s49, v8, 15
-; CHECK-NEXT: v_readlane_b32 s48, v8, 14
-; CHECK-NEXT: v_readlane_b32 s47, v8, 13
-; CHECK-NEXT: v_readlane_b32 s46, v8, 12
-; CHECK-NEXT: v_readlane_b32 s45, v8, 11
-; CHECK-NEXT: v_readlane_b32 s44, v8, 10
-; CHECK-NEXT: v_readlane_b32 s43, v8, 9
-; CHECK-NEXT: v_readlane_b32 s42, v8, 8
-; CHECK-NEXT: v_readlane_b32 s41, v8, 7
-; CHECK-NEXT: v_readlane_b32 s40, v8, 6
-; CHECK-NEXT: v_readlane_b32 s39, v8, 5
-; CHECK-NEXT: v_readlane_b32 s38, v8, 4
-; CHECK-NEXT: v_readlane_b32 s37, v8, 3
-; CHECK-NEXT: v_readlane_b32 s36, v8, 2
-; CHECK-NEXT: v_readlane_b32 s31, v8, 1
-; CHECK-NEXT: v_readlane_b32 s30, v8, 0
-; CHECK-NEXT: ; kill: killed $vgpr4
-; CHECK-NEXT: ; kill: killed $vgpr3
+; CHECK-NEXT: s_or_b64 exec, exec, s[12:13]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readlane_b32 s67, v6, 33
+; CHECK-NEXT: v_readlane_b32 s66, v6, 32
+; CHECK-NEXT: v_readlane_b32 s65, v6, 31
+; CHECK-NEXT: v_readlane_b32 s64, v6, 30
+; CHECK-NEXT: v_readlane_b32 s63, v6, 29
+; CHECK-NEXT: v_readlane_b32 s62, v6, 28
+; CHECK-NEXT: v_readlane_b32 s61, v6, 27
+; CHECK-NEXT: v_readlane_b32 s60, v6, 26
+; CHECK-NEXT: v_readlane_b32 s59, v6, 25
+; CHECK-NEXT: v_readlane_b32 s58, v6, 24
+; CHECK-NEXT: v_readlane_b32 s57, v6, 23
+; CHECK-NEXT: v_readlane_b32 s56, v6, 22
+; CHECK-NEXT: v_readlane_b32 s55, v6, 21
+; CHECK-NEXT: v_readlane_b32 s54, v6, 20
+; CHECK-NEXT: v_readlane_b32 s53, v6, 19
+; CHECK-NEXT: v_readlane_b32 s52, v6, 18
+; CHECK-NEXT: v_readlane_b32 s51, v6, 17
+; CHECK-NEXT: v_readlane_b32 s50, v6, 16
+; CHECK-NEXT: v_readlane_b32 s49, v6, 15
+; CHECK-NEXT: v_readlane_b32 s48, v6, 14
+; CHECK-NEXT: v_readlane_b32 s47, v6, 13
+; CHECK-NEXT: v_readlane_b32 s46, v6, 12
+; CHECK-NEXT: v_readlane_b32 s45, v6, 11
+; CHECK-NEXT: v_readlane_b32 s44, v6, 10
+; CHECK-NEXT: v_readlane_b32 s43, v6, 9
+; CHECK-NEXT: v_readlane_b32 s42, v6, 8
+; CHECK-NEXT: v_readlane_b32 s41, v6, 7
+; CHECK-NEXT: v_readlane_b32 s40, v6, 6
+; CHECK-NEXT: v_readlane_b32 s39, v6, 5
+; CHECK-NEXT: v_readlane_b32 s38, v6, 4
+; CHECK-NEXT: v_readlane_b32 s37, v6, 3
+; CHECK-NEXT: v_readlane_b32 s36, v6, 2
+; CHECK-NEXT: v_readlane_b32 s31, v6, 1
+; CHECK-NEXT: v_readlane_b32 s30, v6, 0
+; CHECK-NEXT: ; kill: killed $vgpr2
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..493ed5956f18f 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -646,12 +646,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s16, s33
+; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT: s_or_saveexec_b64 s[20:21], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: v_writelane_b32 v40, s16, 20
+; GCN-NEXT: s_mov_b64 exec, s[20:21]
+; GCN-NEXT: v_writelane_b32 v40, s18, 20
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
@@ -681,9 +681,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b64 s[36:37], s[8:9]
; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc
+; GCN-NEXT: s_and_saveexec_b64 s[46:47], s[16:17]
; GCN-NEXT: s_cbranch_execz .LBB5_4
; GCN-NEXT: ; %bb.1: ; %bb1
; GCN-NEXT: s_mov_b64 s[48:49], exec
@@ -741,12 +739,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-LABEL: test_indirect_call_vgpr_ptr_in_branch:
; GISEL: ; %bb.0: ; %bb0
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s16, s33
+; GISEL-NEXT: s_mov_b32 s18, s33
; GISEL-NEXT: s_mov_b32 s33, s32
-; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GISEL-NEXT: s_or_saveexec_b64 s[20:21], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT: s_mov_b64 exec, s[18:19]
-; GISEL-NEXT: v_writelane_b32 v40, s16, 20
+; GISEL-NEXT: s_mov_b64 exec, s[20:21]
+; GISEL-NEXT: v_writelane_b32 v40, s18, 20
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
@@ -776,9 +774,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9]
; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7]
; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5]
-; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc
+; GISEL-NEXT: s_and_saveexec_b64 s[46:47], s[16:17]
; GISEL-NEXT: s_cbranch_execz .LBB5_4
; GISEL-NEXT: ; %bb.1: ; %bb1
; GISEL-NEXT: s_mov_b64 s[48:49], exec
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 3b3e107a62967..13372dd94619b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -162,9 +162,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-LABEL: func_uses_lds_multi:
; GFX8-SDAG: ; %bb.0: ; %entry
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
@@ -199,9 +197,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-GISEL-LABEL: func_uses_lds_multi:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_2
@@ -239,9 +235,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX9-SDAG-LABEL: func_uses_lds_multi:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2
@@ -267,9 +261,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX9-GISEL-LABEL: func_uses_lds_multi:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-GISEL-NEXT: s_cbranch_execz .LBB2_2
@@ -295,9 +287,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; SDAG-LABEL: func_uses_lds_multi:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
+; SDAG-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB2_2
@@ -326,9 +316,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GISEL-LABEL: func_uses_lds_multi:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB2_3
@@ -462,113 +450,108 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-LABEL: func_uses_lds_phi_after:
; GFX8-SDAG: ; %bb.0: ; %entry
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-SDAG-NEXT: flat_load_dword v0, v[1:2] glc
+; GFX8-SDAG-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB4_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
-; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
-; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
+; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
+; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
-; GFX8-SDAG-NEXT: flat_load_dword v0, v[1:2] glc
+; GFX8-SDAG-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
-; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-GISEL-NEXT: flat_load_dword v0, v[1:2] glc
+; GFX8-GISEL-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-GISEL-NEXT: s_cbranch_execz .LBB4_2
; GFX8-GISEL-NEXT: ; %bb.1: ; %use.bb
-; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_trap 2
-; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
-; GFX8-GISEL-NEXT: flat_load_dword v0, v[1:2] glc
+; GFX8-GISEL-NEXT: ds_write_b32 v0, v2
+; GFX8-GISEL-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
-; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-SDAG-NEXT: global_load_dword v0, v[1:2], off glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-SDAG-NEXT: s_cbranch_execz .LBB4_2
; GFX9-SDAG-NEXT: ; %bb.1: ; %use.bb
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: ds_write_b32 v0, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT: ds_write_b32 v0, v2
; GFX9-SDAG-NEXT: s_trap 2
-; GFX9-SDAG-NEXT: global_load_dword v0, v[1:2], off glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
-; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-GISEL-NEXT: global_load_dword v0, v[1:2], off glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-GISEL-NEXT: s_cbranch_execz .LBB4_2
; GFX9-GISEL-NEXT: ; %bb.1: ; %use.bb
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_trap 2
-; GFX9-GISEL-NEXT: ds_write_b32 v0, v0
-; GFX9-GISEL-NEXT: global_load_dword v0, v[1:2], off glc
+; GFX9-GISEL-NEXT: ds_write_b32 v0, v2
+; GFX9-GISEL-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
-; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v3, v0
-; SDAG-NEXT: global_load_dword v0, v[1:2], off glc
+; SDAG-NEXT: global_load_dword v2, v[0:1], off glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v3, 1, v3
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB4_3
; SDAG-NEXT: ; %bb.1: ; %use.bb
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: ds_write_b32 v0, v0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: ds_write_b32 v0, v2
; SDAG-NEXT: s_cbranch_execnz .LBB4_4
; SDAG-NEXT: ; %bb.2: ; %use.bb
-; SDAG-NEXT: global_load_dword v0, v[1:2], off glc
+; SDAG-NEXT: global_load_dword v2, v[0:1], off glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
-; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, v2
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
@@ -577,22 +560,21 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-LABEL: func_uses_lds_phi_after:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v3, v0
-; GISEL-NEXT: global_load_dword v0, v[1:2], off glc
+; GISEL-NEXT: global_load_dword v2, v[0:1], off glc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB4_3
; GISEL-NEXT: ; %bb.1: ; %use.bb
; GISEL-NEXT: s_cbranch_execnz .LBB4_4
; GISEL-NEXT: ; %bb.2: ; %use.bb
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: ds_write_b32 v0, v0
-; GISEL-NEXT: global_load_dword v0, v[1:2], off glc
+; GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GISEL-NEXT: ds_write_b32 v0, v2
+; GISEL-NEXT: global_load_dword v2, v[0:1], off glc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
-; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, v2
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 27fb4e5f965c9..68043e807f297 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -509,8 +509,7 @@ define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a,
; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
+; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
@@ -521,8 +520,7 @@ define i1 @test_fold_and_ord(float %a) {
; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
+; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ea823f30f26c2..9afcfdb7a23ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -92,31 +92,31 @@ define i1 @zeromask_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: zeromask_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: zeromask_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: zeromask_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: zeromask_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT: s_mov_b32 s4, 0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: zeromask_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT: s_mov_b32 s0, 0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 0)
ret i1 %1
@@ -127,31 +127,31 @@ define i1 @allflags_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: allflags_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mov_b32_e32 v0, 1
+; GFX7CHECK-NEXT: s_mov_b64 s[4:5], -1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: allflags_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 1
+; GFX8CHECK-NEXT: s_mov_b64 s[4:5], -1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: allflags_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 1
+; GFX9CHECK-NEXT: s_mov_b64 s[4:5], -1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: allflags_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 1
+; GFX10CHECK-NEXT: s_mov_b32 s4, -1
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: allflags_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 1
+; GFX11CHECK-NEXT: s_mov_b32 s0, -1
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1023) ; 0x3ff
ret i1 %1
@@ -168,7 +168,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: snan_bf16:
@@ -180,7 +179,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: snan_bf16:
@@ -192,7 +190,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: snan_bf16:
@@ -202,7 +199,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: snan_bf16:
@@ -212,7 +208,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1) ; 0x001
ret i1 %1
@@ -225,8 +220,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf
-; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: qnan_bf16:
@@ -234,8 +228,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fbf
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: qnan_bf16:
@@ -243,24 +236,21 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fbf
-; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: qnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7fbf, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: qnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7fbf, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 2) ; 0x002
ret i1 %1
@@ -273,38 +263,33 @@ define i1 @posinf_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posinf_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posinf_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posinf_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posinf_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 512) ; 0x200
ret i1 %1
@@ -317,38 +302,33 @@ define i1 @neginf_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: s_mov_b32 s4, 0xff80
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: neginf_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: s_movk_i32 s4, 0xff80
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: neginf_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: s_movk_i32 s4, 0xff80
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: neginf_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0xff80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: neginf_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0xff80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 4) ; 0x004
ret i1 %1
@@ -367,7 +347,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posnormal_bf16:
@@ -379,7 +358,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00
; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posnormal_bf16:
@@ -391,7 +369,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00
; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posnormal_bf16:
@@ -402,7 +379,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posnormal_bf16:
@@ -413,7 +389,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 256) ; 0x100
ret i1 %1
@@ -432,7 +407,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negnormal_bf16:
@@ -444,7 +418,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00
; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negnormal_bf16:
@@ -456,7 +429,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00
; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negnormal_bf16:
@@ -467,7 +439,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negnormal_bf16:
@@ -478,7 +449,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 8) ; 0x008
ret i1 %1
@@ -493,8 +463,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: possubnormal_bf16:
@@ -502,8 +471,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: possubnormal_bf16:
@@ -511,24 +479,21 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: possubnormal_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: possubnormal_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 128) ; 0x080
ret i1 %1
@@ -546,7 +511,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negsubnormal_bf16:
@@ -558,7 +522,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f
; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negsubnormal_bf16:
@@ -570,7 +533,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f
; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negsubnormal_bf16:
@@ -581,7 +543,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, -1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negsubnormal_bf16:
@@ -592,7 +553,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, -1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 16) ; 0x010
ret i1 %1
@@ -604,36 +564,31 @@ define i1 @poszero_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: poszero_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: poszero_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: poszero_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: poszero_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 64) ; 0x040
ret i1 %1
@@ -646,38 +601,33 @@ define i1 @negzero_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: s_mov_b32 s4, 0x8000
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negzero_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x8000
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negzero_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x8000
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negzero_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negzero_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 32) ; 0x020
ret i1 %1
@@ -690,38 +640,33 @@ define i1 @posfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posfinite_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posfinite_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posfinite_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posfinite_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 448) ; 0x1c0
ret i1 %1
@@ -738,7 +683,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negfinite_bf16:
@@ -749,7 +693,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negfinite_bf16:
@@ -760,7 +703,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negfinite_bf16:
@@ -770,7 +712,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negfinite_bf16:
@@ -780,7 +721,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 56) ; 0x038
ret i1 %1
@@ -793,8 +733,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_bf16:
@@ -802,8 +741,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_bf16:
@@ -811,24 +749,21 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3) ; nan
ret i1 %1
@@ -841,8 +776,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnan_bf16:
@@ -850,8 +784,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnan_bf16:
@@ -859,24 +792,21 @@ define i1 @not_isnan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f81, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f81, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1020) ; ~nan
ret i1 %class
@@ -1130,8 +1060,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_bf16:
@@ -1139,8 +1068,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_bf16:
@@ -1148,24 +1076,21 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516) ; 0x204 = "inf"
ret i1 %1
@@ -1178,8 +1103,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_bf16:
@@ -1187,8 +1111,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_bf16:
@@ -1196,24 +1119,21 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504) ; 0x1f8 = "finite"
ret i1 %1
@@ -1226,40 +1146,35 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: issubnormal_or_zero_bf16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: issubnormal_or_zero_bf16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: issubnormal_or_zero_bf16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: issubnormal_or_zero_bf16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 240) ; 0xf0 = "subnormal|zero"
@@ -1273,40 +1188,35 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_issubnormal_or_zero_bf16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_issubnormal_or_zero_bf16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_issubnormal_or_zero_bf16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 783) ; ~0xf0 = "~(subnormal|zero)"
@@ -1322,8 +1232,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00
-; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnormal_bf16:
@@ -1332,8 +1241,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00
-; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnormal_bf16:
@@ -1342,8 +1250,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00
-; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnormal_bf16:
@@ -1351,8 +1258,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
-; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnormal_bf16:
@@ -1360,8 +1266,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
-; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 264) ; 0x108 = "normal"
ret i1 %class
@@ -1376,8 +1281,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7eff
-; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnormal_bf16:
@@ -1386,8 +1290,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff
-; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnormal_bf16:
@@ -1396,8 +1299,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff
-; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnormal_bf16:
@@ -1405,8 +1307,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
-; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnormal_bf16:
@@ -1414,8 +1315,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
-; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 759) ; ~0x108 = "~normal"
ret i1 %class
@@ -1434,7 +1334,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1446,7 +1345,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff
; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1458,7 +1356,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff
; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1469,7 +1366,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_is_plus_normal_bf16:
@@ -1480,7 +1376,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 767) ; ~0x100 = ~"+normal"
ret i1 %class
@@ -1499,7 +1394,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1511,7 +1405,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff
; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1523,7 +1416,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff
; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1534,7 +1426,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_is_neg_normal_bf16:
@@ -1545,7 +1436,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1015) ; ~0x008 = ~"-normal"
ret i1 %class
@@ -1559,8 +1449,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: issubnormal_bf16:
@@ -1569,8 +1458,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: issubnormal_bf16:
@@ -1579,8 +1467,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f
-; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: issubnormal_bf16:
@@ -1588,8 +1475,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: issubnormal_bf16:
@@ -1597,8 +1483,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 144) ; 0x90 = "subnormal"
ret i1 %class
@@ -1612,8 +1497,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7e
-; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_issubnormal_bf16:
@@ -1622,8 +1506,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7e
-; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_issubnormal_bf16:
@@ -1632,8 +1515,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7e
-; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_issubnormal_bf16:
@@ -1641,8 +1523,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7e, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_issubnormal_bf16:
@@ -1650,8 +1531,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1
-; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7e, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 879) ; ~0x90 = ~"subnormal"
ret i1 %class
@@ -1663,40 +1543,35 @@ define i1 @iszero_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 96) ; 0x60 = "zero"
ret i1 %class
@@ -1708,40 +1583,35 @@ define i1 @not_iszero_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 927) ; ~0x60 = ~"zero"
ret i1 %class
@@ -1754,38 +1624,33 @@ define i1 @ispositive_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: ispositive_bf16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: ispositive_bf16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81
-; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: ispositive_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f81, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: ispositive_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f81, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 960) ; fcPositive
ret i1 %class
@@ -1808,7 +1673,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_ispositive_bf16:
@@ -1824,7 +1688,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v1
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_ispositive_bf16:
@@ -1840,7 +1703,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v1
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_ispositive_bf16:
@@ -1854,7 +1716,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5
; GFX10CHECK-NEXT: s_or_b32 s4, s4, s6
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_ispositive_bf16:
@@ -1868,7 +1729,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, s2
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 63) ; ~fcPositive
ret i1 %class
@@ -1889,7 +1749,6 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnegative_bf16:
@@ -1903,7 +1762,6 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s6, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnegative_bf16:
@@ -1917,7 +1775,6 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s6, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnegative_bf16:
@@ -1929,7 +1786,6 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnegative_bf16:
@@ -1941,7 +1797,6 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 60) ; fcNegative
ret i1 %class
@@ -1959,7 +1814,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnegative_bf16:
@@ -1971,7 +1825,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnegative_bf16:
@@ -1983,7 +1836,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnegative_bf16:
@@ -1993,7 +1845,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v1
; GFX10CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnegative_bf16:
@@ -2003,7 +1854,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v1
; GFX11CHECK-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 963) ; ~fcNegative
ret i1 %class
@@ -2019,7 +1869,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_bf16:
@@ -2030,7 +1879,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_bf16:
@@ -2041,7 +1889,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_bf16:
@@ -2051,7 +1898,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_bf16:
@@ -2061,7 +1907,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2078,7 +1923,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2089,7 +1933,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2100,7 +1943,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2110,7 +1952,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
@@ -2120,7 +1961,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2137,7 +1977,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2148,7 +1987,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2159,7 +1997,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2169,7 +2006,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2179,7 +2015,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2196,7 +2031,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2207,7 +2041,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2218,7 +2051,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2228,7 +2060,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_bf16:
@@ -2238,7 +2069,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~0x60 = "~(zero|nan)"
@@ -2255,7 +2085,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2266,7 +2095,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2277,7 +2105,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2287,7 +2114,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
@@ -2297,7 +2123,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2314,7 +2139,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2325,7 +2149,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2336,7 +2159,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2346,7 +2168,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2356,7 +2177,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2373,7 +2193,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2384,7 +2203,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2395,7 +2213,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2405,7 +2222,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_qnan_bf16:
@@ -2415,7 +2231,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 98) ; 0x60|0x2 = "zero|qnan"
@@ -2435,7 +2250,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_snan_bf16:
@@ -2449,7 +2263,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_snan_bf16:
@@ -2463,7 +2276,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_snan_bf16:
@@ -2475,7 +2287,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0, v0
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: s_or_b32 s4, s5, s4
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_snan_bf16:
@@ -2487,7 +2298,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0, v0
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: s_or_b32 s0, s1, s0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 97) ; 0x60|0x1 = "zero|snan"
@@ -2516,7 +2326,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2538,7 +2347,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7f00
; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2560,7 +2368,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f00
; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2578,7 +2385,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_or_b32 s5, s6, s5
; GFX10CHECK-NEXT: s_or_b32 s4, s5, s4
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16:
@@ -2596,7 +2402,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_or_b32 s1, s2, s1
; GFX11CHECK-NEXT: s_or_b32 s0, s1, s0
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)"
@@ -2623,7 +2428,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2643,7 +2447,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0
; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2663,7 +2466,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0
; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2679,7 +2481,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5
; GFX10CHECK-NEXT: s_or_b32 s4, s4, s6
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_snan_bf16:
@@ -2695,7 +2496,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, s2
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)"
@@ -2709,8 +2509,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f7f
-; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_or_nan_bf16:
@@ -2718,8 +2517,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f7f
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_or_nan_bf16:
@@ -2727,24 +2525,21 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f7f
-; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_or_nan_bf16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f7f, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_or_nan_bf16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f7f, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 519) ; 0x204|0x3 = "inf|nan"
@@ -2758,8 +2553,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isinf_or_nan_bf16:
@@ -2767,8 +2561,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isinf_or_nan_bf16:
@@ -2776,24 +2569,21 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isinf_or_nan_bf16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isinf_or_nan_bf16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504) ; ~(0x204|0x3) = "~(inf|nan)"
@@ -2807,8 +2597,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_or_nan_f:
@@ -2816,8 +2605,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_or_nan_f:
@@ -2825,24 +2613,21 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_or_nan_f:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_or_nan_f:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 507) ; 0x1f8|0x3 = "finite|nan"
@@ -2856,8 +2641,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
@@ -2865,8 +2649,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
@@ -2874,24 +2657,21 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], s4, v0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516) ; ~(0x1f8|0x3) = "~(finite|nan)"
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index da64c379672ef..74138ce83e095 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -99,31 +99,31 @@ define i1 @zeromask_f16(half %x) nounwind {
; GFX7CHECK-LABEL: zeromask_f16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: zeromask_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: zeromask_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT: s_mov_b64 s[4:5], 0
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: zeromask_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT: s_mov_b32 s4, 0
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: zeromask_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT: s_mov_b32 s0, 0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 0)
ret i1 %1
@@ -131,35 +131,65 @@ define i1 @zeromask_f16(half %x) nounwind {
; FIXME: DAG and GlobalISel return different values for i1 true
define i1 @allflags_f16(half %x) nounwind {
-; GFX7CHECK-LABEL: allflags_f16:
-; GFX7CHECK: ; %bb.0:
-; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mov_b32_e32 v0, 1
-; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX7SELDAG-LABEL: allflags_f16:
+; GFX7SELDAG: ; %bb.0:
+; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7SELDAG-NEXT: s_mov_b64 s[4:5], -1
+; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8CHECK-LABEL: allflags_f16:
-; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 1
-; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX7GLISEL-LABEL: allflags_f16:
+; GFX7GLISEL: ; %bb.0:
+; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7GLISEL-NEXT: s_mov_b64 s[4:5], 1
+; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9CHECK-LABEL: allflags_f16:
-; GFX9CHECK: ; %bb.0:
-; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 1
-; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX8SELDAG-LABEL: allflags_f16:
+; GFX8SELDAG: ; %bb.0:
+; GFX8SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8SELDAG-NEXT: s_mov_b64 s[4:5], -1
+; GFX8SELDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10CHECK-LABEL: allflags_f16:
-; GFX10CHECK: ; %bb.0:
-; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 1
-; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX8GLISEL-LABEL: allflags_f16:
+; GFX8GLISEL: ; %bb.0:
+; GFX8GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GLISEL-NEXT: s_mov_b64 s[4:5], 1
+; GFX8GLISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11CHECK-LABEL: allflags_f16:
-; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 1
-; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9SELDAG-LABEL: allflags_f16:
+; GFX9SELDAG: ; %bb.0:
+; GFX9SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9SELDAG-NEXT: s_mov_b64 s[4:5], -1
+; GFX9SELDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GLISEL-LABEL: allflags_f16:
+; GFX9GLISEL: ; %bb.0:
+; GFX9GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GLISEL-NEXT: s_mov_b64 s[4:5], 1
+; GFX9GLISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10SELDAG-LABEL: allflags_f16:
+; GFX10SELDAG: ; %bb.0:
+; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10SELDAG-NEXT: s_mov_b32 s4, -1
+; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10GLISEL-LABEL: allflags_f16:
+; GFX10GLISEL: ; %bb.0:
+; GFX10GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10GLISEL-NEXT: s_mov_b32 s4, 1
+; GFX10GLISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11SELDAG-LABEL: allflags_f16:
+; GFX11SELDAG: ; %bb.0:
+; GFX11SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11SELDAG-NEXT: s_mov_b32 s0, -1
+; GFX11SELDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11GLISEL-LABEL: allflags_f16:
+; GFX11GLISEL: ; %bb.0:
+; GFX11GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11GLISEL-NEXT: s_mov_b32 s0, 1
+; GFX11GLISEL-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1023) ; 0x3ff
ret i1 %1
}
@@ -175,7 +205,6 @@ define i1 @snan_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: snan_f16:
@@ -185,36 +214,31 @@ define i1 @snan_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffff83ff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x1ff
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: snan_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: snan_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: snan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 1
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: snan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 1
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1) ; 0x001
ret i1 %1
@@ -227,8 +251,7 @@ define i1 @qnan_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: qnan_f16:
@@ -237,36 +260,31 @@ define i1 @qnan_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00
-; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: qnan_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 2
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: qnan_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 2
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: qnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 2
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: qnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 2
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 2) ; 0x002
ret i1 %1
@@ -278,8 +296,7 @@ define i1 @posinf_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: posinf_f16:
@@ -287,38 +304,33 @@ define i1 @posinf_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posinf_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x200
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posinf_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x200
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posinf_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x200
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posinf_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x200
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 512) ; 0x200
ret i1 %1
@@ -330,8 +342,7 @@ define i1 @neginf_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_mov_b32 s4, 0xfc00
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: neginf_f16:
@@ -339,36 +350,31 @@ define i1 @neginf_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0xfc00
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: neginf_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 4
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: neginf_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 4
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: neginf_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 4
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: neginf_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 4
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 4) ; 0x004
ret i1 %1
@@ -387,7 +393,6 @@ define i1 @posnormal_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: posnormal_f16:
@@ -402,37 +407,32 @@ define i1 @posnormal_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x100
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x100
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x100
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x100
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 256) ; 0x100
ret i1 %1
@@ -451,7 +451,6 @@ define i1 @negnormal_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: negnormal_f16:
@@ -466,35 +465,30 @@ define i1 @negnormal_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 8
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 8
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 8
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 8
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 8) ; 0x008
ret i1 %1
@@ -508,8 +502,7 @@ define i1 @possubnormal_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: possubnormal_f16:
@@ -518,38 +511,33 @@ define i1 @possubnormal_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: possubnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x80
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: possubnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x80
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: possubnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x80
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: possubnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x80
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 128) ; 0x080
ret i1 %1
@@ -567,7 +555,6 @@ define i1 @negsubnormal_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: negsubnormal_f16:
@@ -582,35 +569,30 @@ define i1 @negsubnormal_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negsubnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 16
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negsubnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 16
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negsubnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 16
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negsubnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 16
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 16) ; 0x010
ret i1 %1
@@ -621,44 +603,38 @@ define i1 @poszero_f16(half %x) nounwind {
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: poszero_f16:
; GFX7GLISEL: ; %bb.0:
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: poszero_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 64
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: poszero_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 64
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: poszero_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 64
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: poszero_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 64
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 64) ; 0x040
ret i1 %1
@@ -670,8 +646,7 @@ define i1 @negzero_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_mov_b32 s4, 0x8000
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: negzero_f16:
@@ -679,36 +654,31 @@ define i1 @negzero_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x8000
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negzero_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 32
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negzero_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 32
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negzero_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 32
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negzero_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 32
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 32) ; 0x020
ret i1 %1
@@ -720,8 +690,7 @@ define i1 @posfinite_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: posfinite_f16:
@@ -729,38 +698,33 @@ define i1 @posfinite_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: posfinite_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x1c0
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: posfinite_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x1c0
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: posfinite_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x1c0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: posfinite_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1c0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 448) ; 0x1c0
ret i1 %1
@@ -777,7 +741,6 @@ define i1 @negfinite_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: negfinite_f16:
@@ -790,35 +753,30 @@ define i1 @negfinite_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v0
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: negfinite_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 56
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: negfinite_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 56
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: negfinite_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 56
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: negfinite_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 56
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 56) ; 0x038
ret i1 %1
@@ -831,8 +789,7 @@ define i1 @isnan_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isnan_f16:
@@ -841,36 +798,31 @@ define i1 @isnan_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) ; nan
ret i1 %1
@@ -883,8 +835,7 @@ define i1 @not_isnan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_isnan_f16:
@@ -893,38 +844,33 @@ define i1 @not_isnan_f16(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c01
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnan_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x3fc
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnan_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x3fc
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x3fc
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3fc
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.is.fpclass.f16(half %x, i32 1020) ; ~nan
ret i1 %class
@@ -1316,8 +1262,7 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
; GFX7SELDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isnan_f16_strictfp:
@@ -1326,36 +1271,31 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f16_strictfp:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f16_strictfp:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f16_strictfp:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f16_strictfp:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) strictfp ; nan
ret i1 %1
@@ -1368,8 +1308,7 @@ define i1 @isinf_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isinf_f16:
@@ -1378,38 +1317,33 @@ define i1 @isinf_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 516) ; 0x204 = "inf"
ret i1 %1
@@ -1422,8 +1356,7 @@ define i1 @isfinite_f16(half %x) nounwind {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isfinite_f16:
@@ -1432,38 +1365,33 @@ define i1 @isfinite_f16(half %x) nounwind {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 504) ; 0x1f8 = "finite"
ret i1 %1
@@ -1475,8 +1403,7 @@ define i1 @issubnormal_or_zero_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: issubnormal_or_zero_f16:
@@ -1484,38 +1411,33 @@ define i1 @issubnormal_or_zero_f16(half %x) {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7c00, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: issubnormal_or_zero_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0xf0
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: issubnormal_or_zero_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0xf0
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: issubnormal_or_zero_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0xf0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: issubnormal_or_zero_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0xf0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 240) ; 0xf0 = "subnormal|zero"
@@ -1528,8 +1450,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0
-; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_issubnormal_or_zero_f16:
@@ -1544,37 +1465,32 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_issubnormal_or_zero_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x30f
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_issubnormal_or_zero_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x30f
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_issubnormal_or_zero_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x30f
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_issubnormal_or_zero_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x30f
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 783) ; ~0xf0 = "~(subnormal|zero)"
@@ -1590,8 +1506,7 @@ define i1 @isnormal_f16(half %x) {
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isnormal_f16:
@@ -1601,38 +1516,33 @@ define i1 @isnormal_f16(half %x) {
; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x108
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x108
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x108
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x108
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 264) ; 0x108 = "normal"
ret i1 %class
@@ -1647,8 +1557,7 @@ define i1 @not_isnormal_f16(half %x) {
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_isnormal_f16:
@@ -1664,37 +1573,32 @@ define i1 @not_isnormal_f16(half %x) {
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v2
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x2f7
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x2f7
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x2f7
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2f7
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 759) ; ~0x108 = "~normal"
ret i1 %class
@@ -1713,7 +1617,6 @@ define i1 @not_is_plus_normal_f16(half %x) {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_is_plus_normal_f16:
@@ -1737,37 +1640,32 @@ define i1 @not_is_plus_normal_f16(half %x) {
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_is_plus_normal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x2ff
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_is_plus_normal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x2ff
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_is_plus_normal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x2ff
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_is_plus_normal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2ff
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 767) ; ~0x100 = ~"+normal"
ret i1 %class
@@ -1786,7 +1684,6 @@ define i1 @not_is_neg_normal_f16(half %x) {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_is_neg_normal_f16:
@@ -1810,37 +1707,32 @@ define i1 @not_is_neg_normal_f16(half %x) {
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_is_neg_normal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x3f7
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_is_neg_normal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x3f7
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_is_neg_normal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x3f7
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_is_neg_normal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3f7
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 1015) ; ~0x008 = ~"-normal"
ret i1 %class
@@ -1854,8 +1746,7 @@ define i1 @issubnormal_f16(half %x) {
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: issubnormal_f16:
@@ -1865,38 +1756,33 @@ define i1 @issubnormal_f16(half %x) {
; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: issubnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x90
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: issubnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x90
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: issubnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x90
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: issubnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x90
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 144) ; 0x90 = "subnormal"
ret i1 %class
@@ -1910,8 +1796,7 @@ define i1 @not_issubnormal_f16(half %x) {
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3fe
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_issubnormal_f16:
@@ -1930,37 +1815,32 @@ define i1 @not_issubnormal_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_issubnormal_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x36f
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_issubnormal_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x36f
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_issubnormal_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x36f
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_issubnormal_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x36f
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 879) ; ~0x90 = ~"subnormal"
ret i1 %class
@@ -1972,8 +1852,7 @@ define i1 @iszero_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_f16:
@@ -1981,38 +1860,33 @@ define i1 @iszero_f16(half %x) {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x60
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x60
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x60
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x60
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 96) ; 0x60 = "zero"
ret i1 %class
@@ -2024,8 +1898,7 @@ define i1 @not_iszero_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_f16:
@@ -2047,37 +1920,32 @@ define i1 @not_iszero_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39f
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39f
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39f
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39f
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 927) ; ~0x60 = ~"zero"
ret i1 %class
@@ -2089,8 +1957,7 @@ define i1 @ispositive_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: ispositive_f16:
@@ -2098,38 +1965,33 @@ define i1 @ispositive_f16(half %x) {
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c01
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: ispositive_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x3c0
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: ispositive_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x3c0
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: ispositive_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x3c0
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: ispositive_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c0
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 960) ; fcPositive
ret i1 %class
@@ -2151,7 +2013,6 @@ define i1 @not_ispositive_f16(half %x) {
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_ispositive_f16:
@@ -2169,35 +2030,30 @@ define i1 @not_ispositive_f16(half %x) {
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_ispositive_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 63
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_ispositive_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 63
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_ispositive_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 63
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_ispositive_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 63
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 63) ; ~fcPositive
ret i1 %class
@@ -2217,7 +2073,6 @@ define i1 @isnegative_f16(half %x) {
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isnegative_f16:
@@ -2233,35 +2088,30 @@ define i1 @isnegative_f16(half %x) {
; GFX7GLISEL-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnegative_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 60
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnegative_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, 60
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnegative_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 60
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnegative_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 60
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 60) ; fcNegative
ret i1 %class
@@ -2273,12 +2123,11 @@ define i1 @not_isnegative_f16(half %x) {
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
-; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_isnegative_f16:
@@ -2292,37 +2141,32 @@ define i1 @not_isnegative_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isnegative_f16:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x3c3
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isnegative_f16:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x3c3
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isnegative_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x3c3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isnegative_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 963) ; ~fcNegative
ret i1 %class
@@ -2338,7 +2182,6 @@ define i1 @iszero_or_nan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_or_nan_f16:
@@ -2348,38 +2191,33 @@ define i1 @iszero_or_nan_f16(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffff83ff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2396,7 +2234,6 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_or_nan_f_daz:
@@ -2406,38 +2243,33 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffff83ff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2454,7 +2286,6 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_or_nan_f_maybe_daz:
@@ -2464,38 +2295,33 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffff83ff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0xffff8400
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x63
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan"
@@ -2512,7 +2338,6 @@ define i1 @not_iszero_or_nan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_or_nan_f16:
@@ -2532,37 +2357,32 @@ define i1 @not_iszero_or_nan_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~0x60 = "~(zero|nan)"
@@ -2579,7 +2399,6 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_or_nan_f_daz:
@@ -2599,37 +2418,32 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2646,7 +2460,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_or_nan_f_maybe_daz:
@@ -2666,37 +2479,32 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39c
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39c
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2713,7 +2521,6 @@ define i1 @iszero_or_qnan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_or_qnan_f16:
@@ -2723,38 +2530,33 @@ define i1 @iszero_or_qnan_f16(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffff8200, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0xffff8201
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_qnan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x62
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_qnan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x62
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_qnan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x62
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_qnan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x62
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 98) ; 0x60|0x2 = "zero|qnan"
@@ -2774,7 +2576,6 @@ define i1 @iszero_or_snan_f16(half %x) {
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: iszero_or_snan_f16:
@@ -2787,37 +2588,32 @@ define i1 @iszero_or_snan_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x1ff
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_or_snan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x61
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_or_snan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x61
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_or_snan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x61
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_or_snan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x61
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 97) ; 0x60|0x1 = "zero|snan"
@@ -2846,7 +2642,6 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_or_qnan_f16:
@@ -2870,37 +2665,32 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_qnan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39d
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_qnan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39d
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_qnan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39d
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_qnan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39d
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)"
@@ -2927,7 +2717,6 @@ define i1 @not_iszero_or_snan_f16(half %x) {
; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_iszero_or_snan_f16:
@@ -2950,37 +2739,32 @@ define i1 @not_iszero_or_snan_f16(half %x) {
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_iszero_or_snan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x39e
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_iszero_or_snan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x39e
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_iszero_or_snan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x39e
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_iszero_or_snan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39e
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)"
@@ -2994,8 +2778,7 @@ define i1 @isinf_or_nan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7bff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isinf_or_nan_f16:
@@ -3004,38 +2787,33 @@ define i1 @isinf_or_nan_f16(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_or_nan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x207
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_or_nan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x207
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_or_nan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x207
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_or_nan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x207
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 519) ; 0x204|0x3 = "inf|nan"
@@ -3049,8 +2827,7 @@ define i1 @not_isinf_or_nan_f16(half %x) {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_isinf_or_nan_f16:
@@ -3059,38 +2836,33 @@ define i1 @not_isinf_or_nan_f16(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isinf_or_nan_f16:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isinf_or_nan_f16:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isinf_or_nan_f16:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isinf_or_nan_f16:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 504) ; ~(0x204|0x3) = "~(inf|nan)"
@@ -3104,8 +2876,7 @@ define i1 @isfinite_or_nan_f(half %x) {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: isfinite_or_nan_f:
@@ -3114,38 +2885,33 @@ define i1 @isfinite_or_nan_f(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_or_nan_f:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x1fb
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_or_nan_f:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x1fb
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_or_nan_f:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x1fb
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_or_nan_f:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1fb
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 507) ; 0x1f8|0x3 = "finite|nan"
@@ -3159,8 +2925,7 @@ define i1 @not_isfinite_or_nan_f(half %x) {
; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7GLISEL-LABEL: not_isfinite_or_nan_f:
@@ -3169,38 +2934,33 @@ define i1 @not_isfinite_or_nan_f(half %x) {
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
; GFX8CHECK: ; %bb.0: ; %entry
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
; GFX9CHECK: ; %bb.0: ; %entry
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT: v_cmp_class_f16_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
; GFX10CHECK: ; %bb.0: ; %entry
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
; GFX11CHECK: ; %bb.0: ; %entry
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 516) ; ~(0x1f8|0x3) = "~(finite|nan)"
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 347e549e7cf56..37217ed6d64f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -180,36 +180,30 @@ define i1 @isnan_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 3
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan
ret i1 %1
@@ -989,36 +983,30 @@ define i1 @isnan_f64(double %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f64:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f64:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f64:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s4, v[0:1], 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f64:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 3
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan
ret i1 %1
@@ -1029,36 +1017,30 @@ define i1 @isnan_f32_strictfp(float %x) strictfp nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f32_strictfp:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f32_strictfp:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f32_strictfp:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f32_strictfp:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 3
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan
ret i1 %1
@@ -1069,36 +1051,30 @@ define i1 @isnan_f64_strictfp(double %x) strictfp nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnan_f64_strictfp:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnan_f64_strictfp:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnan_f64_strictfp:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s4, v[0:1], 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnan_f64_strictfp:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 3
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan
ret i1 %1
@@ -1109,39 +1085,33 @@ define i1 @isinf_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX7CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX8CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x204
-; GFX9CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 0x204
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf"
ret i1 %1
@@ -1152,39 +1122,33 @@ define i1 @isinf_f64(double %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v2, 0x204
-; GFX7CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isinf_f64:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v2, 0x204
-; GFX8CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isinf_f64:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v2, 0x204
-; GFX9CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isinf_f64:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s4, v[0:1], 0x204
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isinf_f64:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x204
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f64(double %x, i32 516) ; 0x204 = "inf"
ret i1 %1
@@ -1195,39 +1159,33 @@ define i1 @isfinite_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX7CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX8CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x1f8
-; GFX9CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 0x1f8
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 0x1f8
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite"
ret i1 %1
@@ -1238,39 +1196,33 @@ define i1 @isfinite_f64(double %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v2, 0x1f8
-; GFX7CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isfinite_f64:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v2, 0x1f8
-; GFX8CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isfinite_f64:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v2, 0x1f8
-; GFX9CHECK-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v2
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v2
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isfinite_f64:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s4, v[0:1], 0x1f8
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isfinite_f64:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x1f8
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f64(double %x, i32 504) ; 0x1f8 = "finite"
ret i1 %1
@@ -1281,39 +1233,33 @@ define i1 @isnormal_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v1, 0x108
-; GFX7CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: isnormal_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x108
-; GFX8CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: isnormal_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x108
-; GFX9CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: isnormal_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 0x108
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: isnormal_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 0x108
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264) ; 0x108 = "normal"
ret i1 %1
@@ -1377,39 +1323,33 @@ define i1 @issubnormal_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v1, 0x90
-; GFX7CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: issubnormal_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x90
-; GFX8CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: issubnormal_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x90
-; GFX9CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: issubnormal_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 0x90
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: issubnormal_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 0x90
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal"
ret i1 %1
@@ -1420,39 +1360,33 @@ define i1 @iszero_f32(float %x) nounwind {
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7CHECK-NEXT: v_mov_b32_e32 v1, 0x60
-; GFX7CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: iszero_f32:
; GFX8CHECK: ; %bb.0:
; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, 0x60
-; GFX8CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX8CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX9CHECK-LABEL: iszero_f32:
; GFX9CHECK: ; %bb.0:
; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x60
-; GFX9CHECK-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[4:5], v0, v1
; GFX9CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX10CHECK-LABEL: iszero_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 0x60
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX11CHECK-LABEL: iszero_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, v0, 0x60
-; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero"
ret i1 %1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665fa8269a..779f82454e379 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -16,17 +16,16 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-NEXT: v_mul_hi_u32 v8, v0, v2
; SI-NEXT: v_mul_hi_u32 v9, v1, v3
; SI-NEXT: v_mul_lo_u32 v3, v1, v3
-; SI-NEXT: v_mul_lo_u32 v0, v0, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, v8, v7
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
-; SI-NEXT: v_add_i32_e32 v6, vcc, v1, v5
+; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT: v_add_i32_e32 v7, vcc, v1, v5
; SI-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
-; SI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
-; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
+; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[3:4]
+; SI-NEXT: v_mul_lo_u32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
@@ -46,9 +45,8 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_add3_u32 v1, v1, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_v:
@@ -69,8 +67,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u64_e64 s4, 0, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_v:
@@ -95,8 +92,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: umulo_i64_v_v:
@@ -125,8 +121,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e64 s0, 0, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -137,38 +132,36 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mul_hi_u32 v6, v1, v2
-; SI-NEXT: v_mul_lo_u32 v5, v1, v2
-; SI-NEXT: v_mul_hi_u32 v7, v0, v3
-; SI-NEXT: v_mul_lo_u32 v8, v0, v3
-; SI-NEXT: v_mul_hi_u32 v9, v0, v2
-; SI-NEXT: v_mul_hi_i32 v10, v1, v3
-; SI-NEXT: v_mul_lo_u32 v11, v1, v3
-; SI-NEXT: v_mul_lo_u32 v4, v0, v2
-; SI-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; SI-NEXT: v_add_i32_e32 v9, vcc, v8, v5
-; SI-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
-; SI-NEXT: v_addc_u32_e32 v8, vcc, v7, v6, vcc
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; SI-NEXT: v_mov_b32_e32 v7, v6
-; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; SI-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; SI-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
+; SI-NEXT: v_mul_hi_u32 v5, v1, v2
+; SI-NEXT: v_mul_lo_u32 v4, v1, v2
+; SI-NEXT: v_mul_hi_u32 v6, v0, v3
+; SI-NEXT: v_mul_lo_u32 v7, v0, v3
+; SI-NEXT: v_mul_hi_u32 v8, v0, v2
+; SI-NEXT: v_mul_hi_i32 v9, v1, v3
+; SI-NEXT: v_mul_lo_u32 v10, v1, v3
+; SI-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT: v_add_i32_e32 v8, vcc, v7, v4
+; SI-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4
+; SI-NEXT: v_addc_u32_e32 v7, vcc, v6, v5, vcc
+; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; SI-NEXT: v_mov_b32_e32 v6, v5
+; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; SI-NEXT: v_sub_i32_e32 v9, vcc, v7, v2
+; SI-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; SI-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; SI-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_sub_i32_e32 v9, vcc, v7, v0
; SI-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v4
-; SI-NEXT: v_mov_b32_e32 v1, v5
+; SI-NEXT: v_cndmask_b32_e32 v8, v1, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], v[7:8], v[5:6]
+; SI-NEXT: v_mul_lo_u32 v0, v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
@@ -201,8 +194,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, v4
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_v:
@@ -226,17 +218,16 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u64_e64 s4, v[4:5], v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_v:
@@ -265,19 +256,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_cndmask_b32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: smulo_i64_v_v:
@@ -310,19 +300,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_cndmask_b32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
@@ -719,50 +708,45 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshl_b64 v[5:6], v[0:1], 2
-; SI-NEXT: v_alignbit_b32 v4, v1, v0, 30
-; SI-NEXT: v_ashr_i64 v[2:3], v[5:6], 2
-; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v5
-; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 2
+; SI-NEXT: v_ashr_i64 v[3:4], v[2:3], 2
+; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], v[3:4], v[0:1]
+; SI-NEXT: v_alignbit_b32 v1, v1, v0, 30
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX9-NEXT: v_ashrrev_i64 v[4:5], 2, v[2:3]
; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX9-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_4:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX10-NEXT: v_ashrrev_i64 v[4:5], 2, v[2:3]
; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_cmp_ne_u64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_4:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i64 v[4:5], 2, v[2:3]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: smulo_i64_v_4:
@@ -772,14 +756,13 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[0:1]
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i64 v[4:5], 2, v[2:3]
; GFX12-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
@@ -790,52 +773,46 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: v_lshl_b64 v[4:5], v[0:1], 2
-; SI-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v4
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_and_b32_e32 v5, 0x3fffffff, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 2
+; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[0:1]
+; SI-NEXT: v_alignbit_b32 v1, v1, v0, 30
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
-; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0x3fffffff, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], v[2:3], v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v0, 30
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v5, 0x3fffffff, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_cmp_ne_u64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_4:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v5, 0x3fffffff, v1
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: umulo_i64_v_4:
@@ -845,13 +822,12 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
-; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v5, 0x3fffffff, v1
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[0:1]
; GFX12-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e64 s0, v[4:5], v[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
index 546022b4f9c43..de62bac2b5f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
@@ -19,20 +19,18 @@ define void @loop_on_argument(i1 %arg) {
; CHECK-LABEL: loop_on_argument:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b64 s[6:7], exec, vcc
-; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; CHECK-NEXT: s_and_b64 s[8:9], exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
; CHECK-NEXT: global_store_dword v[0:1], v0, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[6:7]
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index a407cd20bf762..67c76578e012c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -99,9 +99,7 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
; GCN-LABEL: break_cond_is_arg:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-NEXT: s_mov_b32 s10, 1
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_branch .LBB2_2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index b8e74bc7db09a..6fe55fbdfbe9a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -8,13 +8,9 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-LABEL: machinesink_loop_variable_out_of_divergent_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v3, 1, v3
-; CHECK-NEXT: s_mov_b32 s5, 0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_xor_b32 s6, s4, -1
+; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .p2align 6
@@ -25,12 +21,12 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; j lastloop entry
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_or_b32 s5, s4, s5
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_8
; CHECK-NEXT: .LBB0_3: ; %for.body33
; CHECK-NEXT: ; =>This Loop Header: Depth=1
@@ -44,35 +40,34 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: s_branch .LBB0_6
-; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_5: ; %if.end118
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s10
; CHECK-NEXT: s_add_i32 s9, s9, 4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; backedge
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2
-; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v0
+; CHECK-NEXT: s_or_b32 s8, vcc_lo, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
; CHECK-NEXT: .LBB0_6: ; %for.body51
; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_mov_b32_e32 v3, 1
-; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT: s_and_saveexec_b32 s10, s5
; CHECK-NEXT: s_cbranch_execz .LBB0_5
; CHECK-NEXT: ; %bb.7: ; %if.then112
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT: s_add_i32 s10, s9, 4
+; CHECK-NEXT: s_add_i32 s11, s9, 4
; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: ds_write_b32 v1, v4
+; CHECK-NEXT: v_mov_b32_e32 v4, s11
+; CHECK-NEXT: ds_write_b32 v2, v4
; CHECK-NEXT: s_branch .LBB0_5
; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader
; CHECK-NEXT: s_inst_prefetch 0x2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo
; CHECK-NEXT: .LBB0_9: ; %for.body159
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 1e9994dd8e6ef..85d44460ce3dc 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -54,10 +54,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
; GFX9-LABEL: lsr_order_mul24_1:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v18
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
-; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_3
; GFX9-NEXT: ; %bb.1: ; %bb19
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
@@ -83,11 +81,11 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4]
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v20, v13
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5]
+; GFX9-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_and_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v18, vcc
; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4]
; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18
; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
@@ -95,7 +93,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX9-NEXT: ds_write_b32 v6, v3
; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 13f8eff94f86b..f6f3128a1dec9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -17,14 +17,10 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
; ISA-LABEL: nested_inf_loop:
; ISA-NEXT: %bb.0: ; %BB
; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ISA-NEXT: v_and_b32_e32 v1, 1, v1
-; ISA-NEXT: v_and_b32_e32 v0, 1, v0
-; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: .LBB0_1: ; %BB1
-; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
; ISA-NEXT: s_cbranch_execnz .LBB0_1
@@ -32,7 +28,7 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
; ISA: s_or_b64 exec, exec, s[8:9]
; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: .LBB0_3: ; %BB4
-; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
; ISA-NEXT: s_cbranch_execnz .LBB0_3
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index d34769ad0fcf0..e77e8dd2a3821 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -10,12 +10,10 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
; GCN-NEXT: .cfi_startproc
; GCN-NEXT: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: global_load_dwordx2 v[1:2], v[1:2], off
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz .LBB0_3
@@ -27,18 +25,18 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: .LBB0_3: ; %bb2
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: v_mov_b32_e32 v4, v3
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
-; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: .LBB0_4: ; %bb1
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: v_mov_b32_e32 v4, v3
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59..8e22aa65c68b9 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -6,12 +6,11 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x1f5a814b
-; CHECK-NEXT: s_mov_b32 s5, 0x52bf5b
; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 0x295fad, v0
; CHECK-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x52bf5b
+; CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%srem = srem i29 %X, 99
%cmp = icmp eq i29 %srem, 0
@@ -31,8 +30,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 6, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%srem = srem i4 %X, 6
%cmp = icmp eq i4 %srem, 1
@@ -49,8 +47,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK-NEXT: v_and_b32_e32 v1, 60, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_and_b32_e32 v0, 63, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%srem = srem i6 %X, 4
%cmp = icmp ne i6 %srem, 0
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index c6a599094fe43..f32ca994589d4 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -203,33 +203,29 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
; WAVE32-OPT-LABEL: func_stacksave_nonentry_block:
; WAVE32-OPT: ; %bb.0: ; %bb0
; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE32-OPT-NEXT: v_and_b32_e32 v0, 1, v0
-; WAVE32-OPT-NEXT: s_mov_b32 s4, exec_lo
-; WAVE32-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0
+; WAVE32-OPT-NEXT: s_and_saveexec_b32 s5, s4
; WAVE32-OPT-NEXT: s_cbranch_execz .LBB4_2
; WAVE32-OPT-NEXT: ; %bb.1: ; %bb1
-; WAVE32-OPT-NEXT: s_lshr_b32 s5, s32, 5
+; WAVE32-OPT-NEXT: s_lshr_b32 s4, s32, 5
; WAVE32-OPT-NEXT: ;;#ASMSTART
-; WAVE32-OPT-NEXT: ; use s5
+; WAVE32-OPT-NEXT: ; use s4
; WAVE32-OPT-NEXT: ;;#ASMEND
; WAVE32-OPT-NEXT: .LBB4_2: ; %bb2
-; WAVE32-OPT-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; WAVE32-OPT-NEXT: s_or_b32 exec_lo, exec_lo, s5
; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31]
;
; WAVE64-OPT-LABEL: func_stacksave_nonentry_block:
; WAVE64-OPT: ; %bb.0: ; %bb0
; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE64-OPT-NEXT: v_and_b32_e32 v0, 1, v0
-; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], exec
-; WAVE64-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0
+; WAVE64-OPT-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; WAVE64-OPT-NEXT: s_cbranch_execz .LBB4_2
; WAVE64-OPT-NEXT: ; %bb.1: ; %bb1
-; WAVE64-OPT-NEXT: s_lshr_b32 s6, s32, 6
+; WAVE64-OPT-NEXT: s_lshr_b32 s4, s32, 6
; WAVE64-OPT-NEXT: ;;#ASMSTART
-; WAVE64-OPT-NEXT: ; use s6
+; WAVE64-OPT-NEXT: ; use s4
; WAVE64-OPT-NEXT: ;;#ASMEND
; WAVE64-OPT-NEXT: .LBB4_2: ; %bb2
-; WAVE64-OPT-NEXT: s_or_b64 exec, exec, s[4:5]
+; WAVE64-OPT-NEXT: s_or_b64 exec, exec, s[6:7]
; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31]
;
; WAVE32-O0-LABEL: func_stacksave_nonentry_block:
@@ -238,20 +234,13 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4
-; WAVE32-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE32-O0-NEXT: v_mov_b32_e32 v1, v0
-; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1
-; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7
-; WAVE32-O0-NEXT: v_and_b32_e64 v1, 1, v1
-; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v1, 1
+; WAVE32-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; WAVE32-O0-NEXT: s_mov_b32 s4, exec_lo
-; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: v_writelane_b32 v0, s4, 0
; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7
-; WAVE32-O0-NEXT: s_and_b32 s4, s4, s5
+; WAVE32-O0-NEXT: s_and_b32 s4, s4, s6
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4
; WAVE32-O0-NEXT: s_cbranch_execz .LBB4_2
; WAVE32-O0-NEXT: ; %bb.1: ; %bb1
@@ -280,15 +269,8 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5]
-; WAVE64-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE64-O0-NEXT: v_mov_b32_e32 v1, v0
-; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11]
-; WAVE64-O0-NEXT: v_and_b32_e64 v1, 1, v1
-; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, 1
+; WAVE64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; WAVE64-O0-NEXT: s_mov_b64 s[4:5], exec
-; WAVE64-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE64-O0-NEXT: v_writelane_b32 v0, s4, 0
; WAVE64-O0-NEXT: v_writelane_b32 v0, s5, 1
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
@@ -322,14 +304,12 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
; WAVE32-WWM-PREALLOC: ; %bb.0: ; %bb0
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1
-; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4
-; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
-; WAVE32-WWM-PREALLOC-NEXT: v_and_b32_e64 v0, 1, v0
-; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s5, v0, 1
+; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, exec_lo
-; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s4, 0
-; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s5
+; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v0, s4, 0
+; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s6
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4
; WAVE32-WWM-PREALLOC-NEXT: s_cbranch_execz .LBB4_2
; WAVE32-WWM-PREALLOC-NEXT: ; %bb.1: ; %bb1
@@ -339,11 +319,11 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
; WAVE32-WWM-PREALLOC-NEXT: ; use s4
; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND
; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2
-; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0
+; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v0, 0
; WAVE32-WWM-PREALLOC-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr1
+; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr0
; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1
-; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0)
; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index a0dd0e7e78f9d..0de7658dc39c4 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -6,11 +6,10 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT: s_movk_i32 s4, 0x667
; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0xccd, v0
; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_movk_i32 s4, 0x667
+; CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i13 %X, 5
%cmp = icmp eq i13 %urem, 0
@@ -22,14 +21,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x6db6db7
-; CHECK-NEXT: s_mov_b32 s5, 0x924925
; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 26, v0
; CHECK-NEXT: v_bfe_u32 v0, v0, 1, 26
; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x924925
+; CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i27 %X, 14
%cmp = icmp eq i27 %urem, 0
@@ -43,8 +41,7 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
; CHECK-NEXT: v_mul_u32_u24_e32 v0, 13, v0
; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 3, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[4:5], 3, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i4 %X, 5
%cmp = icmp ne i4 %urem, 0
@@ -58,8 +55,7 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0
; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0x133, v0
; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i9 %X, -5
%cmp = icmp ne i9 %urem, 0
>From c0dfff71a591278bb15d328e6cbd804229c7d8ab Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 16:41:49 -0500
Subject: [PATCH 19/20] Testcase updates.
---
.../AMDGPU/GlobalISel/function-i1-args.ll | 142 -----
llvm/test/CodeGen/AMDGPU/function-i1-args.ll | 514 +++++++++++++++++-
2 files changed, 511 insertions(+), 145 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
index 47c4682196d60..8fdd512a1c61a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-i1-args.ll
@@ -706,41 +706,6 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
ret void
}
-define void @test_call_void_func_i1_i1_inreg() {
-; GFX9-LABEL: name: test_call_void_func_i1_i1_inreg
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
-; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX9-NEXT: $sgpr6 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_i1_inreg
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_i1_inreg]]
-; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: SI_RETURN
- %val = load i1, ptr addrspace(1) undef
- call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
- ret void
-}
-
define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
; GFX9-LABEL: name: void_func_i1_inreg_i1
; GFX9: bb.1 (%ir-block.0):
@@ -770,42 +735,6 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
ret void
}
-define void @test_call_void_func_i1_inreg_i1() {
-; GFX9-LABEL: name: test_call_void_func_i1_inreg_i1
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: $sgpr4 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST]](s1)
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_inreg_i1
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_i1]]
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr1 = COPY [[CONST]](s1)
-; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: SI_RETURN
-
- %val = load i1, ptr addrspace(1) undef
- call void @void_func_i1_inreg_i1(i1 inreg %val, i1 true)
- ret void
-}
-
define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
; GFX9-LABEL: name: void_func_zeroext_i1_i1_inreg
; GFX9: bb.1 (%ir-block.0):
@@ -835,41 +764,6 @@ define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
ret void
}
-define void @test_call_void_func_zeroext_i1_i1_inreg() {
-; GFX9-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
-; GFX9-NEXT: $sgpr4_sgpr5 = COPY [[LOAD]](s1)
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX9-NEXT: $sgpr6 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_zeroext_i1_i1_inreg
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_zeroext_i1_i1_inreg]]
-; GFX11-NEXT: $sgpr0 = COPY [[LOAD]](s1)
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[CONST]](s1)
-; GFX11-NEXT: $sgpr1 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: SI_RETURN
- %val = load i1, ptr addrspace(1) undef
- call void @void_func_zeroext_i1_i1_inreg(i1 zeroext %val, i1 inreg true)
- ret void
-}
-
define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
; GFX9-LABEL: name: void_func_i1_inreg_zeroext_i1
; GFX9: bb.1 (%ir-block.0):
@@ -899,42 +793,6 @@ define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
ret void
}
-define void @test_call_void_func_i1_inreg_zeroext_i1() {
-; GFX9-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
-; GFX9: bb.1 (%ir-block.0):
-; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX9-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX9-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX9-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_i1_inreg_zeroext_i1
-; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX9-NEXT: $sgpr4 = COPY [[ANYEXT]](s32)
-; GFX9-NEXT: $sgpr6_sgpr7 = COPY [[CONST]](s1)
-; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-; GFX9-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @void_func_i1_inreg_zeroext_i1, csr_amdgpu, implicit $sgpr4, implicit $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-; GFX9-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX9-NEXT: SI_RETURN
-;
-; GFX11-LABEL: name: test_call_void_func_i1_inreg_zeroext_i1
-; GFX11: bb.1 (%ir-block.0):
-; GFX11-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-; GFX11-NEXT: [[CONST:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `ptr addrspace(1) undef`, addrspace 1)
-; GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
-; GFX11-NEXT: [[GLOBAL:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @[[CALLEE:void_func_i1_inreg_zeroext_i1]]
-; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-; GFX11-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
-; GFX11-NEXT: $sgpr1 = COPY [[CONST]](s1)
-; GFX11-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GLOBAL]](p0), @[[CALLEE]], csr_amdgpu, implicit $sgpr0, implicit $sgpr1
-; GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-; GFX11-NEXT: SI_RETURN
-
- %val = load i1, ptr addrspace(1) undef
- call void @void_func_i1_inreg_zeroext_i1(i1 inreg %val, i1 zeroext true)
- ret void
-}
-
define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
; GFX9-LABEL: name: void_func_signext_i1_i1_inreg
; GFX9: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
index 2d63695674404..caf0879671d85 100644
--- a/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-i1-args.ll
@@ -733,8 +733,8 @@ define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
ret void
}
-define void @many_i1_args(
-; GFX9-LABEL: many_i1_args:
+define void @exhaust_sgprs_by_i1_args(
+; GFX9-LABEL: exhaust_sgprs_by_i1_args:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
@@ -835,7 +835,7 @@ define void @many_i1_args(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: many_i1_args:
+; GFX11-LABEL: exhaust_sgprs_by_i1_args:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -978,6 +978,327 @@ define void @many_i1_args(
ret void
}
+define void @void_func_a64i1([64 x i1] %arg0) {
+; GFX9-LABEL: void_func_a64i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:76
+; GFX9-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX9-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX9-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX9-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX9-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX9-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX9-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX9-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX9-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX9-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX9-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX9-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX9-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX9-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX9-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: global_store_byte v[0:1], v30, off
+; GFX9-NEXT: global_store_byte v[0:1], v29, off
+; GFX9-NEXT: global_store_byte v[0:1], v28, off
+; GFX9-NEXT: global_store_byte v[0:1], v27, off
+; GFX9-NEXT: global_store_byte v[0:1], v26, off
+; GFX9-NEXT: global_store_byte v[0:1], v25, off
+; GFX9-NEXT: global_store_byte v[0:1], v24, off
+; GFX9-NEXT: global_store_byte v[0:1], v23, off
+; GFX9-NEXT: global_store_byte v[0:1], v22, off
+; GFX9-NEXT: global_store_byte v[0:1], v21, off
+; GFX9-NEXT: global_store_byte v[0:1], v20, off
+; GFX9-NEXT: global_store_byte v[0:1], v19, off
+; GFX9-NEXT: global_store_byte v[0:1], v18, off
+; GFX9-NEXT: global_store_byte v[0:1], v17, off
+; GFX9-NEXT: global_store_byte v[0:1], v16, off
+; GFX9-NEXT: global_store_byte v[0:1], v15, off
+; GFX9-NEXT: global_store_byte v[0:1], v14, off
+; GFX9-NEXT: global_store_byte v[0:1], v13, off
+; GFX9-NEXT: global_store_byte v[0:1], v12, off
+; GFX9-NEXT: global_store_byte v[0:1], v11, off
+; GFX9-NEXT: global_store_byte v[0:1], v10, off
+; GFX9-NEXT: global_store_byte v[0:1], v9, off
+; GFX9-NEXT: global_store_byte v[0:1], v8, off
+; GFX9-NEXT: global_store_byte v[0:1], v7, off
+; GFX9-NEXT: global_store_byte v[0:1], v6, off
+; GFX9-NEXT: global_store_byte v[0:1], v5, off
+; GFX9-NEXT: global_store_byte v[0:1], v4, off
+; GFX9-NEXT: global_store_byte v[0:1], v3, off
+; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:72
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:68
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:64
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:60
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:56
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:44
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:36
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: buffer_load_ubyte v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX9-NEXT: global_store_byte v[0:1], v31, off
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[28:29]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[26:27]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[24:25]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[22:23]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[20:21]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[12:13]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[10:11]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a64i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u8 v34, off, s32
+; GFX11-NEXT: v_cndmask_b32_e64 v35, 0, 1, s29
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v36, 0, 1, s28
+; GFX11-NEXT: v_cndmask_b32_e64 v37, 0, 1, s27
+; GFX11-NEXT: v_cndmask_b32_e64 v38, 0, 1, s26
+; GFX11-NEXT: v_cndmask_b32_e64 v39, 0, 1, s25
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v48, 0, 1, s24
+; GFX11-NEXT: v_cndmask_b32_e64 v49, 0, 1, s23
+; GFX11-NEXT: v_cndmask_b32_e64 v50, 0, 1, s22
+; GFX11-NEXT: v_cndmask_b32_e64 v51, 0, 1, s21
+; GFX11-NEXT: v_cndmask_b32_e64 v52, 0, 1, s20
+; GFX11-NEXT: v_cndmask_b32_e64 v53, 0, 1, s19
+; GFX11-NEXT: v_cndmask_b32_e64 v54, 0, 1, s18
+; GFX11-NEXT: v_cndmask_b32_e64 v55, 0, 1, s17
+; GFX11-NEXT: v_cndmask_b32_e64 v64, 0, 1, s16
+; GFX11-NEXT: v_cndmask_b32_e64 v65, 0, 1, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v66, 0, 1, s14
+; GFX11-NEXT: v_cndmask_b32_e64 v67, 0, 1, s13
+; GFX11-NEXT: v_cndmask_b32_e64 v68, 0, 1, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v69, 0, 1, s11
+; GFX11-NEXT: v_cndmask_b32_e64 v70, 0, 1, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v71, 0, 1, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v80, 0, 1, s8
+; GFX11-NEXT: v_cndmask_b32_e64 v81, 0, 1, s7
+; GFX11-NEXT: v_cndmask_b32_e64 v82, 0, 1, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v83, 0, 1, s5
+; GFX11-NEXT: v_cndmask_b32_e64 v84, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v85, 0, 1, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v86, 0, 1, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v87, 0, 1, s1
+; GFX11-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX11-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX11-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX11-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX11-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX11-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX11-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX11-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX11-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX11-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: global_store_b8 v[0:1], v35, off
+; GFX11-NEXT: global_store_b8 v[0:1], v36, off
+; GFX11-NEXT: global_store_b8 v[0:1], v37, off
+; GFX11-NEXT: global_store_b8 v[0:1], v38, off
+; GFX11-NEXT: global_store_b8 v[0:1], v39, off
+; GFX11-NEXT: global_store_b8 v[0:1], v48, off
+; GFX11-NEXT: global_store_b8 v[0:1], v49, off
+; GFX11-NEXT: global_store_b8 v[0:1], v50, off
+; GFX11-NEXT: global_store_b8 v[0:1], v51, off
+; GFX11-NEXT: global_store_b8 v[0:1], v52, off
+; GFX11-NEXT: global_store_b8 v[0:1], v53, off
+; GFX11-NEXT: global_store_b8 v[0:1], v54, off
+; GFX11-NEXT: global_store_b8 v[0:1], v55, off
+; GFX11-NEXT: global_store_b8 v[0:1], v64, off
+; GFX11-NEXT: global_store_b8 v[0:1], v65, off
+; GFX11-NEXT: global_store_b8 v[0:1], v66, off
+; GFX11-NEXT: global_store_b8 v[0:1], v67, off
+; GFX11-NEXT: global_store_b8 v[0:1], v68, off
+; GFX11-NEXT: global_store_b8 v[0:1], v69, off
+; GFX11-NEXT: global_store_b8 v[0:1], v70, off
+; GFX11-NEXT: global_store_b8 v[0:1], v71, off
+; GFX11-NEXT: global_store_b8 v[0:1], v80, off
+; GFX11-NEXT: global_store_b8 v[0:1], v81, off
+; GFX11-NEXT: global_store_b8 v[0:1], v82, off
+; GFX11-NEXT: global_store_b8 v[0:1], v83, off
+; GFX11-NEXT: global_store_b8 v[0:1], v84, off
+; GFX11-NEXT: global_store_b8 v[0:1], v85, off
+; GFX11-NEXT: global_store_b8 v[0:1], v86, off
+; GFX11-NEXT: global_store_b8 v[0:1], v87, off
+; GFX11-NEXT: global_store_b8 v[0:1], v30, off
+; GFX11-NEXT: global_store_b8 v[0:1], v29, off
+; GFX11-NEXT: global_store_b8 v[0:1], v28, off
+; GFX11-NEXT: s_clause 0x19
+; GFX11-NEXT: global_store_b8 v[0:1], v27, off
+; GFX11-NEXT: global_store_b8 v[0:1], v26, off
+; GFX11-NEXT: global_store_b8 v[0:1], v25, off
+; GFX11-NEXT: global_store_b8 v[0:1], v24, off
+; GFX11-NEXT: global_store_b8 v[0:1], v23, off
+; GFX11-NEXT: global_store_b8 v[0:1], v22, off
+; GFX11-NEXT: global_store_b8 v[0:1], v21, off
+; GFX11-NEXT: global_store_b8 v[0:1], v20, off
+; GFX11-NEXT: global_store_b8 v[0:1], v19, off
+; GFX11-NEXT: global_store_b8 v[0:1], v18, off
+; GFX11-NEXT: global_store_b8 v[0:1], v17, off
+; GFX11-NEXT: global_store_b8 v[0:1], v16, off
+; GFX11-NEXT: global_store_b8 v[0:1], v15, off
+; GFX11-NEXT: global_store_b8 v[0:1], v14, off
+; GFX11-NEXT: global_store_b8 v[0:1], v13, off
+; GFX11-NEXT: global_store_b8 v[0:1], v12, off
+; GFX11-NEXT: global_store_b8 v[0:1], v11, off
+; GFX11-NEXT: global_store_b8 v[0:1], v10, off
+; GFX11-NEXT: global_store_b8 v[0:1], v9, off
+; GFX11-NEXT: global_store_b8 v[0:1], v8, off
+; GFX11-NEXT: global_store_b8 v[0:1], v7, off
+; GFX11-NEXT: global_store_b8 v[0:1], v6, off
+; GFX11-NEXT: global_store_b8 v[0:1], v5, off
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: v_cndmask_b32_e64 v31, 0, 1, s0
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v32
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 1, v33
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 1, v34
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: global_store_b8 v[0:1], v3, off
+; GFX11-NEXT: global_store_b8 v[0:1], v4, off
+; GFX11-NEXT: global_store_b8 v[0:1], v31, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store [64 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
; GFX9-LABEL: void_func_i1_i1_inreg:
; GFX9: ; %bb.0:
@@ -1008,6 +1329,76 @@ define void @void_func_i1_i1_inreg(i1 %arg0, i1 inreg %arg1) {
ret void
}
+define void @test_call_void_func_i1_i1_inreg() {
+; GFX9-LABEL: test_call_void_func_i1_i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s7, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_i1_inreg@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_i1_inreg@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b32 s6, 1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_void_func_i1_i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_i1_inreg@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_i1_inreg@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s1, 1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_i1_inreg(i1 %val, i1 inreg true)
+ ret void
+}
+
define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
; GFX9-LABEL: void_func_i1_inreg_i1:
; GFX9: ; %bb.0:
@@ -1037,3 +1428,120 @@ define void @void_func_i1_inreg_i1(i1 inreg %arg0, i1 %arg1) {
ret void
}
+define void @void_func_zeroext_i1_i1_inreg(i1 zeroext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_zeroext_i1_i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: s_and_b32 s4, s6, 1
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_zeroext_i1_i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_and_b32 s0, s1, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg_zeroext_i1(i1 inreg %arg0, i1 zeroext %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_zeroext_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_zeroext_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_signext_i1_i1_inreg(i1 signext %arg0, i1 inreg %arg1) {
+; GFX9-LABEL: void_func_signext_i1_i1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: s_and_b32 s4, s6, 1
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_signext_i1_i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_and_b32 s0, s1, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg_signext_i1(i1 inreg %arg0, i1 signext %arg1) {
+; GFX9-LABEL: void_func_i1_inreg_signext_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg_signext_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
>From 6f2289b8a7e896765d5fa14dda3c87052f9c8a5a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 13 May 2024 17:44:52 -0500
Subject: [PATCH 20/20] Fix test file after merge from main.
---
.../CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 5d2f794b94c4d..f0ab1b25d6f03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -101,8 +101,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY]], [[C]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
- ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2
+ ; CHECK-NEXT: [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+ ; CHECK-NEXT: G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
; CHECK-NEXT: G_BR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb1:
More information about the llvm-commits
mailing list