[llvm] [AMDGPU][GlobalISel] Expand SGPR S1 exts into G_SELECT (PR #68858)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 12 00:00:13 PDT 2023
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/68858
This patch prevents most SGPR S1 extends (G_SEXT/G_ZEXT/G_ANYEXT) from reaching ISel by lowering them to a G_SELECT during RegBankSelect.
It also adds new ISel logic to lower `G_SELECT cond, 1, 0` to a simple SCC copy, because a copy out of SCC is already selected as an `s_cselect x, 1, 0`. Without that change we see some regressions.
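For illustration, here is a minimal C++ sketch of the two transforms using the generic MachineIRBuilder API. The free-standing helpers and their names are invented for this example (in the patch, the logic lives inside AMDGPURegisterBankInfo::applyMappingImpl and AMDGPUInstructionSelector::selectG_SELECT, and also handles s16/s64 destinations); AMDGPU::SGPRRegBank and AMDGPU::SCC come from the AMDGPU backend headers:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
using namespace llvm;

// (1) RegBankSelect: lower an SGPR s1 sign/zero/any-extend into a 32-bit
// select so no s1 extend survives to instruction selection. G_ANYEXT is
// treated like a zero-extend here, as in the patch.
static void lowerSgprS1Ext(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           Register Dst, Register BoolSrc, bool Signed) {
  const LLT S32 = LLT::scalar(32);
  // Widen the s1 condition first; the select then operates on a legal type.
  auto Cond = B.buildAnyExt(S32, BoolSrc);
  auto True = B.buildConstant(S32, Signed ? -1 : 1); // sext -> -1, zext -> 1
  auto False = B.buildConstant(S32, 0);
  auto Sel = B.buildSelect(Dst, Cond, True, False);
  // Keep every intermediate value on the scalar unit.
  for (Register R : {Cond.getReg(0), True.getReg(0), False.getReg(0),
                     Sel.getReg(0)})
    MRI.setRegBank(R, AMDGPU::SGPRRegBank);
}

// (2) ISel: a G_SELECT between 1 and 0 whose condition is already in SCC
// reduces to a plain copy, because a copy out of SCC is itself selected to
// s_cselect_b32 dst, 1, 0.
static bool foldSelectOneZero(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                              Register Dst, Register TrueVal,
                              Register FalseVal) {
  if (getIConstantVRegVal(TrueVal, MRI) != 1 ||
      getIConstantVRegVal(FalseVal, MRI) != 0)
    return false;
  B.buildCopy(Dst, Register(AMDGPU::SCC));
  return true;
}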
From ca999104982649ec0978206fe4a85e778120e509 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 12 Oct 2023 08:59:25 +0200
Subject: [PATCH] [AMDGPU][GlobalISel] Expand SGPR S1 exts into G_SELECT
This patch prevents most SGPR S1 extends (G_SEXT/G_ZEXT/G_ANYEXT) from reaching ISel by lowering them to a G_SELECT during RegBankSelect.
It also adds new ISel logic to lower `G_SELECT cond, 1, 0` to a simple SCC copy, because a copy out of SCC is already selected as an `s_cselect x, 1, 0`. Without that change we see some regressions.
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 12 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 48 ++--
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 50 ++++-
llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 9 -
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 -
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 212 +++++++++---------
.../GlobalISel/regbankselect-anyext.mir | 32 ++-
.../GlobalISel/regbankselect-freeze.mir | 5 +-
.../AMDGPU/GlobalISel/regbankselect-sext.mir | 34 ++-
.../AMDGPU/GlobalISel/regbankselect-zext.mir | 36 ++-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 1 -
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1 -
llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 9 -
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1 -
.../AMDGPU/commute-compares-scalar-float.ll | 80 +++----
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 4 -
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 140 +++++-------
22 files changed, 376 insertions(+), 307 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 8d4cad4c07bc74c..16f82b50f6dec01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -133,6 +133,15 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+// Form G_AMDGPU_SGPR_S1_TO_VCC_COPY from (copy (trunc)) with the right regbanks.
+def fold_sgpr_s1_to_vcc_copy : GICombineRule<
+ (defs root:$dst),
+  (match (COPY $dst, $tmp):$copy,
+         (G_TRUNC $tmp, $src),
+    [{ return Helper.matchFoldSgprS1ToVCCCopy(*${copy}, ${src}.getReg()); }]),
+  (apply [{ Helper.applyFoldSgprS1ToVCCCopy(*${copy}, ${src}.getReg()); }]) // FIXME: placeholder helper names.
+>;
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -155,7 +164,8 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
- [unmerge_merge, unmerge_cst, unmerge_undef,
+ [unmerge_merge, unmerge_cst, unmerge_undef, trunc_ext_fold,
+ anyext_trunc_fold, select_constant_cmp,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..85b5b5988619905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2153,24 +2153,40 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
Register CCReg = CCOp.getReg();
+
+ Register TrueVal = I.getOperand(2).getReg();
+ Register FalseVal = I.getOperand(3).getReg();
if (!isVCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
.addReg(CCReg);
- // The generic constrainSelectedInstRegOperands doesn't work for the scc register
- // bank, because it does not cover the register class that we used to represent
- // for it. So we need to manually set the register class here.
- if (!MRI->getRegClassOrNull(CCReg))
- MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
- MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
- .add(I.getOperand(2))
- .add(I.getOperand(3));
+ bool Ret = constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+
+  // A select between 1 and 0 is just a copy of SCC.
+ if (getIConstantVRegVal(TrueVal, *MRI) == 1 &&
+ getIConstantVRegVal(FalseVal, *MRI) == 0) {
+ // FIXME: Do we need to have two copies or could we get away with just
+ // returning CCReg?
+ MachineInstr *RetCopy =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(AMDGPU::SCC);
+ Ret |= constrainSelectedInstRegOperands(*RetCopy, TII, TRI, RBI);
+ } else {
+      // The generic constrainSelectedInstRegOperands doesn't work for the
+      // SCC register bank, because it does not cover the register class we
+      // use to represent it, so we need to manually set the register class
+      // here.
+ if (!MRI->getRegClassOrNull(CCReg))
+ MRI->setRegClass(CCReg,
+ TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .addReg(TrueVal)
+ .addReg(FalseVal);
+ Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ }
- bool Ret = false;
- Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
- Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
}
@@ -2181,11 +2197,11 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
MachineInstr *Select =
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
- .addImm(0)
- .add(I.getOperand(3))
- .addImm(0)
- .add(I.getOperand(2))
- .add(I.getOperand(1));
+ .addImm(0)
+ .addReg(FalseVal)
+ .addImm(0)
+ .addReg(TrueVal)
+ .add(I.getOperand(1));
bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..402ee5cfbb5a053 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2556,16 +2556,62 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
const bool Signed = Opc == AMDGPU::G_SEXT;
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
assert(OpdMapper.getVRegs(1).empty());
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- Register DstReg = MI.getOperand(0).getReg();
+    LLT SelType = MRI.getType(DstReg);
+
+ // Extending SGPR S1 to S16/32/64.
+    if (SrcBank == &AMDGPU::SGPRRegBank &&
+        SrcTy == LLT::scalar(1) &&
+        (SelType == S32 || SelType == S16 || SelType == S64)) {
+
+ Register False = B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(False, AMDGPU::SGPRRegBank);
+
+ Register True = Signed ? B.buildConstant(S32, -1).getReg(0)
+ : B.buildConstant(S32, 1).getReg(0);
+ MRI.setRegBank(True, AMDGPU::SGPRRegBank);
+
+ B.setInstrAndDebugLoc(MI);
+ Register NewReg = MRI.createGenericVirtualRegister(S32);
+    B.buildAnyExt(NewReg, SrcReg);
+ MRI.setRegBank(NewReg, AMDGPU::SGPRRegBank);
+
+ if (SelType == S32) {
+ B.buildSelect(DstReg, NewReg, True, False);
+ } else if (SelType == S16) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ B.buildTrunc(DstReg, TmpReg);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ } else if (SelType == S64) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ Register HiPart = Signed ? TmpReg : B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(HiPart, AMDGPU::SGPRRegBank);
+
+ B.buildMergeLikeInstr(DstReg, {TmpReg, HiPart});
+ } else
+ llvm_unreachable("bad type");
+
+ MRI.setRegBank(DstReg, *SrcBank); // FIXME: Correct?
+ MI.eraseFromParent();
+ return;
+ }
+
LLT DstTy = MRI.getType(DstReg);
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
@@ -2609,7 +2655,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
SrcBank->getID() == AMDGPU::SGPRRegBankID;
// TODO: Should s16 select be legal?
- LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
+ LLT SelType = UseSel64 ? LLT::scalar(64) : S32;
auto True = B.buildConstant(SelType, Signed ? -1 : 1);
auto False = B.buildConstant(SelType, 0);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index a1013f3803e781a..cdb817a00911016 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -640,7 +640,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-NEXT: s_cmp_lt_i32 s1, 0
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s1, s0
-; GFX7-NEXT: s_and_b32 s0, s0, 1
; GFX7-NEXT: s_add_i32 s0, s2, s0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -652,7 +651,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8-NEXT: s_cmp_lt_i32 s1, 0
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_add_i32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -664,7 +662,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s1, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_add_i32 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -749,8 +746,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s2, s0
; GFX7-NEXT: s_xor_b32 s1, s3, s1
-; GFX7-NEXT: s_and_b32 s0, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_add_i32 s0, s4, s0
; GFX7-NEXT: s_add_i32 s1, s5, s1
; GFX7-NEXT: ; return to shader part epilog
@@ -769,8 +764,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s2, s0
; GFX8-NEXT: s_xor_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_add_i32 s0, s4, s0
; GFX8-NEXT: s_add_i32 s1, s5, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -789,8 +782,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s2, s0
; GFX9-NEXT: s_xor_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s0, s4, s0
; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 043e69abaeef2d3..38d49add27df132 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -66,7 +66,6 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2ba..138eb26063f67c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -36,7 +36,6 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB0_4
; GFX9-NEXT: ; %bb.3: ; %bb0
@@ -121,7 +120,6 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB1_4
; GFX9-NEXT: ; %bb.3: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index eb3f74be71de017..cef9e86c49e513a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -860,7 +860,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: v_mov_b32_e32 v1, s12
; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT: s_mul_i32 s18, s1, s8
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_add_u32 s18, s18, s17
; GFX7-NEXT: s_addc_u32 s17, s23, s22
; GFX7-NEXT: v_mov_b32_e32 v4, s11
@@ -871,33 +871,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_mul_i32 s24, s1, s11
; GFX7-NEXT: v_readfirstlane_b32 s28, v3
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: v_readfirstlane_b32 s27, v5
+; GFX7-NEXT: v_readfirstlane_b32 s25, v5
; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
-; GFX7-NEXT: s_cselect_b32 s26, 1, 0
+; GFX7-NEXT: s_cselect_b32 s27, 1, 0
; GFX7-NEXT: s_add_u32 s24, s24, s22
-; GFX7-NEXT: s_addc_u32 s23, s27, s23
+; GFX7-NEXT: s_addc_u32 s23, s25, s23
; GFX7-NEXT: v_readfirstlane_b32 s29, v5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
-; GFX7-NEXT: s_mul_i32 s27, s2, s10
+; GFX7-NEXT: s_mul_i32 s25, s2, s10
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
-; GFX7-NEXT: s_add_u32 s24, s27, s24
+; GFX7-NEXT: s_add_u32 s24, s25, s24
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
-; GFX7-NEXT: s_addc_u32 s27, s28, s23
+; GFX7-NEXT: s_addc_u32 s25, s28, s23
; GFX7-NEXT: s_mul_i32 s28, s3, s9
; GFX7-NEXT: s_cselect_b32 s23, 1, 0
; GFX7-NEXT: s_add_u32 s28, s28, s24
; GFX7-NEXT: v_readfirstlane_b32 s30, v6
; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
-; GFX7-NEXT: s_addc_u32 s27, s29, s27
+; GFX7-NEXT: s_addc_u32 s25, s29, s25
; GFX7-NEXT: s_mul_i32 s29, s4, s8
; GFX7-NEXT: s_cselect_b32 s24, 1, 0
; GFX7-NEXT: s_add_u32 s28, s29, s28
; GFX7-NEXT: v_readfirstlane_b32 s33, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
-; GFX7-NEXT: s_addc_u32 s27, s30, s27
+; GFX7-NEXT: s_addc_u32 s29, s30, s25
; GFX7-NEXT: s_mul_i32 s30, s16, s11
-; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_cselect_b32 s25, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s31, v6
; GFX7-NEXT: s_add_u32 s19, s30, s19
; GFX7-NEXT: s_addc_u32 s28, s31, s28
@@ -919,84 +919,84 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_addc_u32 s28, s35, s28
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: s_cselect_b32 s34, 1, 0
-; GFX7-NEXT: s_cmp_lg_u32 s26, 0
-; GFX7-NEXT: s_addc_u32 s19, s25, s19
+; GFX7-NEXT: s_cmp_lg_u32 s27, 0
+; GFX7-NEXT: s_addc_u32 s19, s26, s19
; GFX7-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT: s_addc_u32 s20, s20, 0
-; GFX7-NEXT: v_readfirstlane_b32 s26, v0
+; GFX7-NEXT: v_readfirstlane_b32 s27, v0
; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
-; GFX7-NEXT: s_cmp_lg_u32 s25, 0
+; GFX7-NEXT: s_cmp_lg_u32 s26, 0
; GFX7-NEXT: s_addc_u32 s20, s20, s28
-; GFX7-NEXT: s_mul_i32 s25, s16, s14
+; GFX7-NEXT: s_mul_i32 s26, s16, s14
; GFX7-NEXT: s_mul_i32 s28, s1, s13
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT: s_mul_i32 s28, s2, s12
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT: s_mul_i32 s28, s3, s11
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT: s_mul_i32 s28, s4, s10
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT: s_mul_i32 s28, s5, s9
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT: v_readfirstlane_b32 s36, v1
; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s6, s8
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s16, s13
; GFX7-NEXT: v_readfirstlane_b32 s35, v2
-; GFX7-NEXT: s_add_u32 s27, s28, s27
+; GFX7-NEXT: s_add_u32 s28, s28, s29
; GFX7-NEXT: v_readfirstlane_b32 s37, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
-; GFX7-NEXT: s_addc_u32 s25, s35, s25
+; GFX7-NEXT: s_addc_u32 s26, s35, s26
; GFX7-NEXT: s_mul_i32 s35, s1, s12
-; GFX7-NEXT: s_cselect_b32 s28, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s35, s27
-; GFX7-NEXT: s_addc_u32 s25, s36, s25
+; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_add_u32 s28, s35, s28
+; GFX7-NEXT: s_addc_u32 s26, s36, s26
; GFX7-NEXT: s_mul_i32 s36, s2, s11
; GFX7-NEXT: s_cselect_b32 s35, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s36, s27
+; GFX7-NEXT: s_add_u32 s28, s36, s28
; GFX7-NEXT: v_readfirstlane_b32 s38, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
-; GFX7-NEXT: s_addc_u32 s25, s37, s25
+; GFX7-NEXT: s_addc_u32 s26, s37, s26
; GFX7-NEXT: s_mul_i32 s37, s3, s10
; GFX7-NEXT: s_cselect_b32 s36, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s37, s27
+; GFX7-NEXT: s_add_u32 s28, s37, s28
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
-; GFX7-NEXT: s_addc_u32 s25, s38, s25
+; GFX7-NEXT: s_addc_u32 s26, s38, s26
; GFX7-NEXT: s_mul_i32 s38, s4, s9
; GFX7-NEXT: s_cselect_b32 s37, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s39, v1
-; GFX7-NEXT: s_add_u32 s27, s38, s27
-; GFX7-NEXT: s_addc_u32 s25, s39, s25
+; GFX7-NEXT: s_add_u32 s28, s38, s28
+; GFX7-NEXT: s_addc_u32 s26, s39, s26
; GFX7-NEXT: s_mul_i32 s39, s5, s8
; GFX7-NEXT: s_cselect_b32 s38, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s40, v0
-; GFX7-NEXT: s_add_u32 s27, s39, s27
-; GFX7-NEXT: s_addc_u32 s25, s40, s25
+; GFX7-NEXT: s_add_u32 s28, s39, s28
+; GFX7-NEXT: s_addc_u32 s26, s40, s26
; GFX7-NEXT: s_cselect_b32 s39, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s31, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
@@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_cmp_lg_u32 s34, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
-; GFX7-NEXT: s_addc_u32 s21, s30, s27
-; GFX7-NEXT: s_cselect_b32 s27, 1, 0
+; GFX7-NEXT: s_addc_u32 s21, s30, s28
+; GFX7-NEXT: s_cselect_b32 s28, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s23, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
; GFX7-NEXT: s_cmp_lg_u32 s24, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s29, 0
+; GFX7-NEXT: s_cmp_lg_u32 s25, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s27, 0
-; GFX7-NEXT: s_addc_u32 s22, s22, s25
+; GFX7-NEXT: s_cmp_lg_u32 s28, 0
+; GFX7-NEXT: s_addc_u32 s22, s22, s26
; GFX7-NEXT: s_mul_i32 s16, s16, s15
-; GFX7-NEXT: s_addc_u32 s15, s26, s16
+; GFX7-NEXT: s_addc_u32 s15, s27, s16
; GFX7-NEXT: s_mul_i32 s1, s1, s14
; GFX7-NEXT: s_cmp_lg_u32 s39, 0
; GFX7-NEXT: s_addc_u32 s1, s15, s1
@@ -1033,7 +1033,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_cmp_lg_u32 s35, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_mul_i32 s6, s6, s9
-; GFX7-NEXT: s_cmp_lg_u32 s28, 0
+; GFX7-NEXT: s_cmp_lg_u32 s29, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s6
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s0, s0, s8
@@ -1081,7 +1081,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: v_mov_b32_e32 v1, s12
; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX8-NEXT: s_mul_i32 s18, s1, s8
-; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_add_u32 s18, s18, s17
; GFX8-NEXT: s_addc_u32 s17, s23, s22
; GFX8-NEXT: v_mov_b32_e32 v4, s11
@@ -1092,33 +1092,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_mul_i32 s24, s1, s11
; GFX8-NEXT: v_readfirstlane_b32 s28, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_readfirstlane_b32 s27, v5
+; GFX8-NEXT: v_readfirstlane_b32 s25, v5
; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9
-; GFX8-NEXT: s_cselect_b32 s26, 1, 0
+; GFX8-NEXT: s_cselect_b32 s27, 1, 0
; GFX8-NEXT: s_add_u32 s24, s24, s22
-; GFX8-NEXT: s_addc_u32 s23, s27, s23
+; GFX8-NEXT: s_addc_u32 s23, s25, s23
; GFX8-NEXT: v_readfirstlane_b32 s29, v5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8
-; GFX8-NEXT: s_mul_i32 s27, s2, s10
+; GFX8-NEXT: s_mul_i32 s25, s2, s10
; GFX8-NEXT: s_cselect_b32 s22, 1, 0
-; GFX8-NEXT: s_add_u32 s24, s27, s24
+; GFX8-NEXT: s_add_u32 s24, s25, s24
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10
-; GFX8-NEXT: s_addc_u32 s27, s28, s23
+; GFX8-NEXT: s_addc_u32 s25, s28, s23
; GFX8-NEXT: s_mul_i32 s28, s3, s9
; GFX8-NEXT: s_cselect_b32 s23, 1, 0
; GFX8-NEXT: s_add_u32 s28, s28, s24
; GFX8-NEXT: v_readfirstlane_b32 s30, v6
; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4
-; GFX8-NEXT: s_addc_u32 s27, s29, s27
+; GFX8-NEXT: s_addc_u32 s25, s29, s25
; GFX8-NEXT: s_mul_i32 s29, s4, s8
; GFX8-NEXT: s_cselect_b32 s24, 1, 0
; GFX8-NEXT: s_add_u32 s28, s29, s28
; GFX8-NEXT: v_readfirstlane_b32 s33, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9
-; GFX8-NEXT: s_addc_u32 s27, s30, s27
+; GFX8-NEXT: s_addc_u32 s29, s30, s25
; GFX8-NEXT: s_mul_i32 s30, s16, s11
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s31, v6
; GFX8-NEXT: s_add_u32 s19, s30, s19
; GFX8-NEXT: s_addc_u32 s28, s31, s28
@@ -1140,84 +1140,84 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_addc_u32 s28, s35, s28
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: s_cselect_b32 s34, 1, 0
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_addc_u32 s19, s25, s19
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_addc_u32 s19, s26, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s13
-; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX8-NEXT: s_addc_u32 s20, s20, 0
-; GFX8-NEXT: v_readfirstlane_b32 s26, v0
+; GFX8-NEXT: v_readfirstlane_b32 s27, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1
-; GFX8-NEXT: s_cmp_lg_u32 s25, 0
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
; GFX8-NEXT: s_addc_u32 s20, s20, s28
-; GFX8-NEXT: s_mul_i32 s25, s16, s14
+; GFX8-NEXT: s_mul_i32 s26, s16, s14
; GFX8-NEXT: s_mul_i32 s28, s1, s13
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX8-NEXT: s_mul_i32 s28, s2, s12
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX8-NEXT: s_mul_i32 s28, s3, s11
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX8-NEXT: s_mul_i32 s28, s4, s10
-; GFX8-NEXT: s_add_u32 s25, s28, s25
+; GFX8-NEXT: s_add_u32 s26, s28, s26
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX8-NEXT: s_mul_i32 s28, s5, s9
-; GFX8-NEXT: s_add_u32 s25, s28, s25
+; GFX8-NEXT: s_add_u32 s26, s28, s26
; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX8-NEXT: v_readfirstlane_b32 s36, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: s_mul_i32 s28, s6, s8
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: s_mul_i32 s28, s16, s13
; GFX8-NEXT: v_readfirstlane_b32 s35, v2
-; GFX8-NEXT: s_add_u32 s27, s28, s27
+; GFX8-NEXT: s_add_u32 s28, s28, s29
; GFX8-NEXT: v_readfirstlane_b32 s37, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10
-; GFX8-NEXT: s_addc_u32 s25, s35, s25
+; GFX8-NEXT: s_addc_u32 s26, s35, s26
; GFX8-NEXT: s_mul_i32 s35, s1, s12
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s35, s27
-; GFX8-NEXT: s_addc_u32 s25, s36, s25
+; GFX8-NEXT: s_cselect_b32 s29, 1, 0
+; GFX8-NEXT: s_add_u32 s28, s35, s28
+; GFX8-NEXT: s_addc_u32 s26, s36, s26
; GFX8-NEXT: s_mul_i32 s36, s2, s11
; GFX8-NEXT: s_cselect_b32 s35, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s36, s27
+; GFX8-NEXT: s_add_u32 s28, s36, s28
; GFX8-NEXT: v_readfirstlane_b32 s38, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9
-; GFX8-NEXT: s_addc_u32 s25, s37, s25
+; GFX8-NEXT: s_addc_u32 s26, s37, s26
; GFX8-NEXT: s_mul_i32 s37, s3, s10
; GFX8-NEXT: s_cselect_b32 s36, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s37, s27
+; GFX8-NEXT: s_add_u32 s28, s37, s28
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8
-; GFX8-NEXT: s_addc_u32 s25, s38, s25
+; GFX8-NEXT: s_addc_u32 s26, s38, s26
; GFX8-NEXT: s_mul_i32 s38, s4, s9
; GFX8-NEXT: s_cselect_b32 s37, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s39, v1
-; GFX8-NEXT: s_add_u32 s27, s38, s27
-; GFX8-NEXT: s_addc_u32 s25, s39, s25
+; GFX8-NEXT: s_add_u32 s28, s38, s28
+; GFX8-NEXT: s_addc_u32 s26, s39, s26
; GFX8-NEXT: s_mul_i32 s39, s5, s8
; GFX8-NEXT: s_cselect_b32 s38, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s40, v0
-; GFX8-NEXT: s_add_u32 s27, s39, s27
-; GFX8-NEXT: s_addc_u32 s25, s40, s25
+; GFX8-NEXT: s_add_u32 s28, s39, s28
+; GFX8-NEXT: s_addc_u32 s26, s40, s26
; GFX8-NEXT: s_cselect_b32 s39, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s31, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
@@ -1226,18 +1226,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_cmp_lg_u32 s34, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_addc_u32 s21, s30, s27
-; GFX8-NEXT: s_cselect_b32 s27, 1, 0
+; GFX8-NEXT: s_addc_u32 s21, s30, s28
+; GFX8-NEXT: s_cselect_b32 s28, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
-; GFX8-NEXT: s_addc_u32 s22, s22, s25
+; GFX8-NEXT: s_cmp_lg_u32 s28, 0
+; GFX8-NEXT: s_addc_u32 s22, s22, s26
; GFX8-NEXT: s_mul_i32 s16, s16, s15
-; GFX8-NEXT: s_addc_u32 s15, s26, s16
+; GFX8-NEXT: s_addc_u32 s15, s27, s16
; GFX8-NEXT: s_mul_i32 s1, s1, s14
; GFX8-NEXT: s_cmp_lg_u32 s39, 0
; GFX8-NEXT: s_addc_u32 s1, s15, s1
@@ -1254,7 +1254,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_cmp_lg_u32 s35, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_mul_i32 s6, s6, s9
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
+; GFX8-NEXT: s_cmp_lg_u32 s29, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s6
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s0, s0, s8
@@ -1286,15 +1286,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
; GFX9-NEXT: s_add_u32 s17, s22, s17
-; GFX9-NEXT: s_addc_u32 s18, s23, s18
-; GFX9-NEXT: s_mul_i32 s23, s1, s8
-; GFX9-NEXT: s_cselect_b32 s22, 1, 0
+; GFX9-NEXT: s_addc_u32 s22, s23, s18
+; GFX9-NEXT: s_mul_i32 s18, s1, s8
+; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8
-; GFX9-NEXT: s_add_u32 s17, s23, s17
-; GFX9-NEXT: s_addc_u32 s18, s24, s18
+; GFX9-NEXT: s_add_u32 s18, s18, s17
+; GFX9-NEXT: s_addc_u32 s17, s24, s22
; GFX9-NEXT: s_mul_i32 s24, s16, s12
; GFX9-NEXT: s_mul_i32 s26, s1, s11
-; GFX9-NEXT: s_cselect_b32 s23, 1, 0
+; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12
; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11
; GFX9-NEXT: s_add_u32 s24, s26, s24
@@ -1335,8 +1335,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_add_u32 s19, s34, s19
; GFX9-NEXT: s_addc_u32 s24, s35, s24
; GFX9-NEXT: s_cselect_b32 s34, 1, 0
-; GFX9-NEXT: s_cmp_lg_u32 s23, 0
-; GFX9-NEXT: s_addc_u32 s19, s22, s19
+; GFX9-NEXT: s_cmp_lg_u32 s22, 0
+; GFX9-NEXT: s_addc_u32 s19, s23, s19
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_addc_u32 s20, s20, 0
@@ -1439,8 +1439,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s0, s0, s8
; GFX9-NEXT: s_add_u32 s7, s7, s1
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s2, s18
+; GFX9-NEXT: s_mov_b32 s1, s18
+; GFX9-NEXT: s_mov_b32 s2, s17
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
index b76fbdd9bed15e5..e47e7d63fc9453d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
@@ -50,7 +50,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s16) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -71,7 +75,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -92,7 +99,12 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -180,7 +192,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s16) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_ANYEXT %1
@@ -198,7 +214,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ANYEXT %1
@@ -216,7 +235,12 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_ANYEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
index d87bc1f01bdb85b..bb2917ac83886c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
@@ -62,8 +62,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
; CHECK-NEXT: [[FREEZE:%[0-9]+]]:sgpr(s1) = G_FREEZE [[TRUNC]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[FREEZE]](s1)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0(s32)
%2:_(s1) = G_FREEZE %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
index 9e56cb85bf409a0..17c6609e2de2ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
@@ -69,7 +69,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s16) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -90,7 +94,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -111,7 +118,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -199,7 +210,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s16) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_SEXT %1
@@ -217,7 +232,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_SEXT %1
@@ -235,7 +253,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_SEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
index 8756061d89ca224..d486958c8e9b31d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
@@ -68,7 +68,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s16) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -89,7 +93,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -110,7 +117,12 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s64) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -198,7 +210,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s16) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_ZEXT %1
@@ -216,7 +232,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ZEXT %1
@@ -234,7 +253,12 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s64) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_ZEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index cded5c94edf8cc3..7bed94ff8ff4dce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1081,7 +1081,6 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_xor_b32 s0, s1, s0
; GFX8-NEXT: s_ashr_i32 s1, s3, 23
; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s2
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4248f7b6a158312..bef24b766b4c3e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -335,7 +335,6 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
-; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d0c55c69f508775..174963c3d6e56e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -327,7 +327,6 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
-; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 65455d754be4f53..0177f00fbb64b4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1081,7 +1081,6 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_xor_b32 s0, s1, s0
; GFX8-NEXT: s_ashr_i32 s1, s3, 23
; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s2
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index f8e7e5ecd6260fb..e05531dac57aedb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -640,7 +640,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-NEXT: s_cmp_gt_i32 s1, 0
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s1, s0
-; GFX7-NEXT: s_and_b32 s0, s0, 1
; GFX7-NEXT: s_sub_i32 s0, s2, s0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -652,7 +651,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8-NEXT: s_cmp_gt_i32 s1, 0
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_sub_i32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -664,7 +662,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9-NEXT: s_cmp_gt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s1, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_sub_i32 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -749,8 +746,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s2, s0
; GFX7-NEXT: s_xor_b32 s1, s3, s1
-; GFX7-NEXT: s_and_b32 s0, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_sub_i32 s0, s4, s0
; GFX7-NEXT: s_sub_i32 s1, s5, s1
; GFX7-NEXT: ; return to shader part epilog
@@ -769,8 +764,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s2, s0
; GFX8-NEXT: s_xor_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_sub_i32 s0, s4, s0
; GFX8-NEXT: s_sub_i32 s1, s5, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -789,8 +782,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s2, s0
; GFX9-NEXT: s_xor_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_sub_i32 s0, s4, s0
; GFX9-NEXT: s_sub_i32 s1, s5, s1
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 77737b356ff6e9c..c14a7ed5b8c8db3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -323,7 +323,6 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s4, 1
-; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 097f6642cbc669b..6f7ebc8f300c170 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -319,7 +319,6 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s4, 1
-; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
index e996fda4c9fd6ca..7c481310de64859 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
@@ -19,9 +19,8 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -51,9 +50,8 @@ define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -83,9 +81,8 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -115,9 +112,8 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -147,9 +143,8 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -179,9 +174,8 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -211,9 +205,8 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -243,9 +236,8 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -275,9 +267,8 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -307,9 +298,8 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -339,9 +329,8 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -371,9 +360,8 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -403,9 +391,8 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -435,9 +422,8 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -467,9 +453,8 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg %
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -499,9 +484,8 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg %
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c94693c4..dd60749db53f85f 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -252,7 +252,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
@@ -381,7 +380,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
@@ -525,8 +523,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 671ead6127308dd..a33388b747d6498 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -34,8 +34,7 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3
; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00
-; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0
-; GFX7GLISEL-NEXT: s_bfe_i32 s3, s3, 0x10000
+; GFX7GLISEL-NEXT: s_cselect_b32 s3, -1, 0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
index 19e50be155a9646..f8424b89eb7a626 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
@@ -19,9 +19,8 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -51,9 +50,8 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_eq_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -83,9 +81,8 @@ define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -115,9 +112,8 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -147,9 +143,8 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lg_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -179,9 +174,8 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -211,9 +205,8 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_o_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -243,9 +236,8 @@ define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_u_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -275,9 +267,8 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -307,9 +298,8 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlg_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -339,9 +329,8 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -371,9 +360,8 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -403,9 +391,8 @@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_neq_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -435,9 +422,8 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -467,9 +453,8 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -499,9 +484,8 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_eq_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -531,9 +515,8 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -563,9 +546,8 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -595,9 +577,8 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lg_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -627,9 +608,8 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -659,9 +639,8 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_o_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -691,9 +670,8 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_u_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -723,9 +701,8 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -755,9 +732,8 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlg_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -787,9 +763,8 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -819,9 +794,8 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -851,9 +825,8 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_neq_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -883,9 +856,8 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
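
A note for readers skimming the autogenerated test deltas above: the recurring change folds a two-instruction sequence into one. `s_bfe_i32 sN, sN, 0x10000` is a signed bit-field extract with offset 0 and width 1 (the immediate packs the offset into bits [5:0] and the width into bits [22:16]), i.e. it sign-extends bit 0. Applied to the result of `s_cselect_b32 sN, 1, 0` it therefore produces exactly `s_cselect_b32 sN, -1, 0`, and on GFX11 the `s_delay_alu` shrinks because one SALU-to-SALU dependency disappears with the dropped instruction. Likewise, the `s_and_b32 s6, s6, 1` removed in fptrunc.ll masked a value that is already 0 or 1, so it was a no-op. A minimal C++ sketch of the main equivalence (the helper name is illustrative, not an LLVM or AMDGPU API):

```cpp
#include <cassert>
#include <cstdint>

// Models s_bfe_i32 dst, src, 0x10000: signed bit-field extract of width 1 at
// offset 0, i.e. sign-extend bit 0 of src (1 -> -1, 0 -> 0).
static int32_t sBfeI32Width1Offset0(int32_t Src) {
  return -(Src & 1);
}

int main() {
  for (bool SCC : {false, true}) {
    // Old sequence: s_cselect_b32 s2, 1, 0 ; s_bfe_i32 s2, s2, 0x10000
    int32_t Old = sBfeI32Width1Offset0(SCC ? 1 : 0);
    // New sequence: s_cselect_b32 s2, -1, 0
    int32_t New = SCC ? -1 : 0;
    assert(Old == New); // both materialize a sign-extended i1 from SCC
  }
  return 0;
}
```

This is also why the sign-extended G_SELECT lowering costs nothing extra: the select of -1/0 on SCC is a single `s_cselect_b32`, the same instruction the old 1/0 select already used.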