[llvm] [AMDGPU][GlobalISel] Expand SGPR S1 exts into G_SELECT (PR #68858)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 12 00:00:13 PDT 2023
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/68858
This patch prevents most SGPR S1 extends (G_SEXT/G_ZEXT/G_ANYEXT) from reaching ISel by lowering them to a G_SELECT during RegBankSelect.
It also adds new ISel logic to lower `G_SELECT cond, 1, 0` to a simple SCC copy, because a copy out of SCC is already selected as an `s_cselect x, 1, 0`. Without that change we see some regressions.
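For illustration, here is a minimal C++ sketch of the two transforms using the generic MachineIRBuilder API. The free-standing helpers and their names are invented for this example (in the patch, the logic lives inside AMDGPURegisterBankInfo::applyMappingImpl and AMDGPUInstructionSelector::selectG_SELECT, and also handles s16/s64 destinations); AMDGPU::SGPRRegBank and AMDGPU::SCC come from the AMDGPU backend headers:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
using namespace llvm;

// (1) RegBankSelect: lower an SGPR s1 sign/zero/any-extend into a 32-bit
// select so no s1 extend survives to instruction selection. G_ANYEXT is
// treated like a zero-extend here, as in the patch.
static void lowerSgprS1Ext(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           Register Dst, Register BoolSrc, bool Signed) {
  const LLT S32 = LLT::scalar(32);
  // Widen the s1 condition first; the select then operates on a legal type.
  auto Cond = B.buildAnyExt(S32, BoolSrc);
  auto True = B.buildConstant(S32, Signed ? -1 : 1); // sext -> -1, zext -> 1
  auto False = B.buildConstant(S32, 0);
  auto Sel = B.buildSelect(Dst, Cond, True, False);
  // Keep every intermediate value on the scalar unit.
  for (Register R : {Cond.getReg(0), True.getReg(0), False.getReg(0),
                     Sel.getReg(0)})
    MRI.setRegBank(R, AMDGPU::SGPRRegBank);
}

// (2) ISel: a G_SELECT between 1 and 0 whose condition is already in SCC
// reduces to a plain copy, because a copy out of SCC is itself selected to
// s_cselect_b32 dst, 1, 0.
static bool foldSelectOneZero(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                              Register Dst, Register TrueVal,
                              Register FalseVal) {
  if (getIConstantVRegVal(TrueVal, MRI) != 1 ||
      getIConstantVRegVal(FalseVal, MRI) != 0)
    return false;
  B.buildCopy(Dst, Register(AMDGPU::SCC));
  return true;
}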
From ca999104982649ec0978206fe4a85e778120e509 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 12 Oct 2023 08:59:25 +0200
Subject: [PATCH] [AMDGPU][GlobalISel] Expand SGPR S1 exts into G_SELECT
This patch prevents most SGPR S1 extends (G_SEXT/G_ZEXT/G_ANYEXT) from reaching ISel by lowering them to a G_SELECT during RegBankSelect.
It also adds new ISel logic to lower `G_SELECT cond, 1, 0` to a simple SCC copy, because a copy out of SCC is already selected as an `s_cselect x, 1, 0`. Without that change we see some regressions.
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 12 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 48 ++--
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 50 ++++-
llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 9 -
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 -
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 212 +++++++++---------
.../GlobalISel/regbankselect-anyext.mir | 32 ++-
.../GlobalISel/regbankselect-freeze.mir | 5 +-
.../AMDGPU/GlobalISel/regbankselect-sext.mir | 34 ++-
.../AMDGPU/GlobalISel/regbankselect-zext.mir | 36 ++-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 1 -
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1 -
llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 9 -
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 1 -
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1 -
.../AMDGPU/commute-compares-scalar-float.ll | 80 +++----
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 4 -
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 140 +++++-------
22 files changed, 376 insertions(+), 307 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 8d4cad4c07bc74c..16f82b50f6dec01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -133,6 +133,15 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+// Form G_AMDGPU_SGPR_S1_TO_VCC_COPY from (copy (trunc)) with the right regbanks.
+def fold_sgpr_s1_to_vcc_copy : GICombineRule<
+ (defs root:$dst),
+  (match (COPY $dst, $tmp):$copy,
+         (G_TRUNC $tmp, $src),
+    [{ return Helper.matchFoldSgprS1ToVCCCopy(*${copy}, ${src}.getReg()); }]),
+  (apply [{ Helper.applyFoldSgprS1ToVCCCopy(*${copy}, ${src}.getReg()); }]) // FIXME: placeholder helper names.
+>;
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -155,7 +164,8 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
- [unmerge_merge, unmerge_cst, unmerge_undef,
+ [unmerge_merge, unmerge_cst, unmerge_undef, trunc_ext_fold,
+ anyext_trunc_fold, select_constant_cmp,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..85b5b5988619905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2153,24 +2153,40 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
Register CCReg = CCOp.getReg();
+
+ Register TrueVal = I.getOperand(2).getReg();
+ Register FalseVal = I.getOperand(3).getReg();
if (!isVCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
.addReg(CCReg);
- // The generic constrainSelectedInstRegOperands doesn't work for the scc register
- // bank, because it does not cover the register class that we used to represent
- // for it. So we need to manually set the register class here.
- if (!MRI->getRegClassOrNull(CCReg))
- MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
- MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
- .add(I.getOperand(2))
- .add(I.getOperand(3));
+ bool Ret = constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+
+  // A select between 1 and 0 is just a copy of SCC.
+ if (getIConstantVRegVal(TrueVal, *MRI) == 1 &&
+ getIConstantVRegVal(FalseVal, *MRI) == 0) {
+ // FIXME: Do we need to have two copies or could we get away with just
+ // returning CCReg?
+ MachineInstr *RetCopy =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(AMDGPU::SCC);
+ Ret |= constrainSelectedInstRegOperands(*RetCopy, TII, TRI, RBI);
+ } else {
+      // The generic constrainSelectedInstRegOperands doesn't work for the
+      // SCC register bank, because it does not cover the register class we
+      // use to represent it, so we need to manually set the register class
+      // here.
+ if (!MRI->getRegClassOrNull(CCReg))
+ MRI->setRegClass(CCReg,
+ TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .addReg(TrueVal)
+ .addReg(FalseVal);
+ Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ }
- bool Ret = false;
- Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
- Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
}
@@ -2181,11 +2197,11 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
MachineInstr *Select =
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
- .addImm(0)
- .add(I.getOperand(3))
- .addImm(0)
- .add(I.getOperand(2))
- .add(I.getOperand(1));
+ .addImm(0)
+ .addReg(FalseVal)
+ .addImm(0)
+ .addReg(TrueVal)
+ .add(I.getOperand(1));
bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..402ee5cfbb5a053 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2556,16 +2556,62 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
const bool Signed = Opc == AMDGPU::G_SEXT;
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
assert(OpdMapper.getVRegs(1).empty());
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- Register DstReg = MI.getOperand(0).getReg();
+    LLT SelType = MRI.getType(DstReg);
+
+ // Extending SGPR S1 to S16/32/64.
+    if (SrcBank == &AMDGPU::SGPRRegBank &&
+        SrcTy == LLT::scalar(1) &&
+        (SelType == S32 || SelType == S16 || SelType == S64)) {
+
+ Register False = B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(False, AMDGPU::SGPRRegBank);
+
+ Register True = Signed ? B.buildConstant(S32, -1).getReg(0)
+ : B.buildConstant(S32, 1).getReg(0);
+ MRI.setRegBank(True, AMDGPU::SGPRRegBank);
+
+ B.setInstrAndDebugLoc(MI);
+ Register NewReg = MRI.createGenericVirtualRegister(S32);
+    B.buildAnyExt(NewReg, SrcReg);
+ MRI.setRegBank(NewReg, AMDGPU::SGPRRegBank);
+
+ if (SelType == S32) {
+ B.buildSelect(DstReg, NewReg, True, False);
+ } else if (SelType == S16) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ B.buildTrunc(DstReg, TmpReg);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ } else if (SelType == S64) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ Register HiPart = Signed ? TmpReg : B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(HiPart, AMDGPU::SGPRRegBank);
+
+ B.buildMergeLikeInstr(DstReg, {TmpReg, HiPart});
+ } else
+ llvm_unreachable("bad type");
+
+ MRI.setRegBank(DstReg, *SrcBank); // FIXME: Correct?
+ MI.eraseFromParent();
+ return;
+ }
+
LLT DstTy = MRI.getType(DstReg);
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
@@ -2609,7 +2655,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
SrcBank->getID() == AMDGPU::SGPRRegBankID;
// TODO: Should s16 select be legal?
- LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
+ LLT SelType = UseSel64 ? LLT::scalar(64) : S32;
auto True = B.buildConstant(SelType, Signed ? -1 : 1);
auto False = B.buildConstant(SelType, 0);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index a1013f3803e781a..cdb817a00911016 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -640,7 +640,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-NEXT: s_cmp_lt_i32 s1, 0
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s1, s0
-; GFX7-NEXT: s_and_b32 s0, s0, 1
; GFX7-NEXT: s_add_i32 s0, s2, s0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -652,7 +651,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8-NEXT: s_cmp_lt_i32 s1, 0
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_add_i32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -664,7 +662,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s1, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_add_i32 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -749,8 +746,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s2, s0
; GFX7-NEXT: s_xor_b32 s1, s3, s1
-; GFX7-NEXT: s_and_b32 s0, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_add_i32 s0, s4, s0
; GFX7-NEXT: s_add_i32 s1, s5, s1
; GFX7-NEXT: ; return to shader part epilog
@@ -769,8 +764,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s2, s0
; GFX8-NEXT: s_xor_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_add_i32 s0, s4, s0
; GFX8-NEXT: s_add_i32 s1, s5, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -789,8 +782,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s2, s0
; GFX9-NEXT: s_xor_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s0, s4, s0
; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 043e69abaeef2d3..38d49add27df132 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -66,7 +66,6 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2ba..138eb26063f67c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -36,7 +36,6 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB0_4
; GFX9-NEXT: ; %bb.3: ; %bb0
@@ -121,7 +120,6 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB1_4
; GFX9-NEXT: ; %bb.3: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index eb3f74be71de017..cef9e86c49e513a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -860,7 +860,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: v_mov_b32_e32 v1, s12
; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT: s_mul_i32 s18, s1, s8
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_add_u32 s18, s18, s17
; GFX7-NEXT: s_addc_u32 s17, s23, s22
; GFX7-NEXT: v_mov_b32_e32 v4, s11
@@ -871,33 +871,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_mul_i32 s24, s1, s11
; GFX7-NEXT: v_readfirstlane_b32 s28, v3
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: v_readfirstlane_b32 s27, v5
+; GFX7-NEXT: v_readfirstlane_b32 s25, v5
; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
-; GFX7-NEXT: s_cselect_b32 s26, 1, 0
+; GFX7-NEXT: s_cselect_b32 s27, 1, 0
; GFX7-NEXT: s_add_u32 s24, s24, s22
-; GFX7-NEXT: s_addc_u32 s23, s27, s23
+; GFX7-NEXT: s_addc_u32 s23, s25, s23
; GFX7-NEXT: v_readfirstlane_b32 s29, v5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
-; GFX7-NEXT: s_mul_i32 s27, s2, s10
+; GFX7-NEXT: s_mul_i32 s25, s2, s10
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
-; GFX7-NEXT: s_add_u32 s24, s27, s24
+; GFX7-NEXT: s_add_u32 s24, s25, s24
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
-; GFX7-NEXT: s_addc_u32 s27, s28, s23
+; GFX7-NEXT: s_addc_u32 s25, s28, s23
; GFX7-NEXT: s_mul_i32 s28, s3, s9
; GFX7-NEXT: s_cselect_b32 s23, 1, 0
; GFX7-NEXT: s_add_u32 s28, s28, s24
; GFX7-NEXT: v_readfirstlane_b32 s30, v6
; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
-; GFX7-NEXT: s_addc_u32 s27, s29, s27
+; GFX7-NEXT: s_addc_u32 s25, s29, s25
; GFX7-NEXT: s_mul_i32 s29, s4, s8
; GFX7-NEXT: s_cselect_b32 s24, 1, 0
; GFX7-NEXT: s_add_u32 s28, s29, s28
; GFX7-NEXT: v_readfirstlane_b32 s33, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
-; GFX7-NEXT: s_addc_u32 s27, s30, s27
+; GFX7-NEXT: s_addc_u32 s29, s30, s25
; GFX7-NEXT: s_mul_i32 s30, s16, s11
-; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_cselect_b32 s25, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s31, v6
; GFX7-NEXT: s_add_u32 s19, s30, s19
; GFX7-NEXT: s_addc_u32 s28, s31, s28
@@ -919,84 +919,84 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_addc_u32 s28, s35, s28
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: s_cselect_b32 s34, 1, 0
-; GFX7-NEXT: s_cmp_lg_u32 s26, 0
-; GFX7-NEXT: s_addc_u32 s19, s25, s19
+; GFX7-NEXT: s_cmp_lg_u32 s27, 0
+; GFX7-NEXT: s_addc_u32 s19, s26, s19
; GFX7-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT: s_addc_u32 s20, s20, 0
-; GFX7-NEXT: v_readfirstlane_b32 s26, v0
+; GFX7-NEXT: v_readfirstlane_b32 s27, v0
; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
-; GFX7-NEXT: s_cmp_lg_u32 s25, 0
+; GFX7-NEXT: s_cmp_lg_u32 s26, 0
; GFX7-NEXT: s_addc_u32 s20, s20, s28
-; GFX7-NEXT: s_mul_i32 s25, s16, s14
+; GFX7-NEXT: s_mul_i32 s26, s16, s14
; GFX7-NEXT: s_mul_i32 s28, s1, s13
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT: s_mul_i32 s28, s2, s12
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT: s_mul_i32 s28, s3, s11
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT: s_mul_i32 s28, s4, s10
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT: s_mul_i32 s28, s5, s9
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT: v_readfirstlane_b32 s36, v1
; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s6, s8
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s16, s13
; GFX7-NEXT: v_readfirstlane_b32 s35, v2
-; GFX7-NEXT: s_add_u32 s27, s28, s27
+; GFX7-NEXT: s_add_u32 s28, s28, s29
; GFX7-NEXT: v_readfirstlane_b32 s37, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
-; GFX7-NEXT: s_addc_u32 s25, s35, s25
+; GFX7-NEXT: s_addc_u32 s26, s35, s26
; GFX7-NEXT: s_mul_i32 s35, s1, s12
-; GFX7-NEXT: s_cselect_b32 s28, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s35, s27
-; GFX7-NEXT: s_addc_u32 s25, s36, s25
+; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_add_u32 s28, s35, s28
+; GFX7-NEXT: s_addc_u32 s26, s36, s26
; GFX7-NEXT: s_mul_i32 s36, s2, s11
; GFX7-NEXT: s_cselect_b32 s35, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s36, s27
+; GFX7-NEXT: s_add_u32 s28, s36, s28
; GFX7-NEXT: v_readfirstlane_b32 s38, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
-; GFX7-NEXT: s_addc_u32 s25, s37, s25
+; GFX7-NEXT: s_addc_u32 s26, s37, s26
; GFX7-NEXT: s_mul_i32 s37, s3, s10
; GFX7-NEXT: s_cselect_b32 s36, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s37, s27
+; GFX7-NEXT: s_add_u32 s28, s37, s28
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
-; GFX7-NEXT: s_addc_u32 s25, s38, s25
+; GFX7-NEXT: s_addc_u32 s26, s38, s26
; GFX7-NEXT: s_mul_i32 s38, s4, s9
; GFX7-NEXT: s_cselect_b32 s37, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s39, v1
-; GFX7-NEXT: s_add_u32 s27, s38, s27
-; GFX7-NEXT: s_addc_u32 s25, s39, s25
+; GFX7-NEXT: s_add_u32 s28, s38, s28
+; GFX7-NEXT: s_addc_u32 s26, s39, s26
; GFX7-NEXT: s_mul_i32 s39, s5, s8
; GFX7-NEXT: s_cselect_b32 s38, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s40, v0
-; GFX7-NEXT: s_add_u32 s27, s39, s27
-; GFX7-NEXT: s_addc_u32 s25, s40, s25
+; GFX7-NEXT: s_add_u32 s28, s39, s28
+; GFX7-NEXT: s_addc_u32 s26, s40, s26
; GFX7-NEXT: s_cselect_b32 s39, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s31, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
@@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_cmp_lg_u32 s34, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
-; GFX7-NEXT: s_addc_u32 s21, s30, s27
-; GFX7-NEXT: s_cselect_b32 s27, 1, 0
+; GFX7-NEXT: s_addc_u32 s21, s30, s28
+; GFX7-NEXT: s_cselect_b32 s28, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s23, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
; GFX7-NEXT: s_cmp_lg_u32 s24, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s29, 0
+; GFX7-NEXT: s_cmp_lg_u32 s25, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s27, 0
-; GFX7-NEXT: s_addc_u32 s22, s22, s25
+; GFX7-NEXT: s_cmp_lg_u32 s28, 0
+; GFX7-NEXT: s_addc_u32 s22, s22, s26
; GFX7-NEXT: s_mul_i32 s16, s16, s15
-; GFX7-NEXT: s_addc_u32 s15, s26, s16
+; GFX7-NEXT: s_addc_u32 s15, s27, s16
; GFX7-NEXT: s_mul_i32 s1, s1, s14
; GFX7-NEXT: s_cmp_lg_u32 s39, 0
; GFX7-NEXT: s_addc_u32 s1, s15, s1
@@ -1033,7 +1033,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_cmp_lg_u32 s35, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_mul_i32 s6, s6, s9
-; GFX7-NEXT: s_cmp_lg_u32 s28, 0
+; GFX7-NEXT: s_cmp_lg_u32 s29, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s6
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s0, s0, s8
@@ -1081,7 +1081,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: v_mov_b32_e32 v1, s12
; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX8-NEXT: s_mul_i32 s18, s1, s8
-; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_add_u32 s18, s18, s17
; GFX8-NEXT: s_addc_u32 s17, s23, s22
; GFX8-NEXT: v_mov_b32_e32 v4, s11
@@ -1092,33 +1092,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_mul_i32 s24, s1, s11
; GFX8-NEXT: v_readfirstlane_b32 s28, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_readfirstlane_b32 s27, v5
+; GFX8-NEXT: v_readfirstlane_b32 s25, v5
; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9
-; GFX8-NEXT: s_cselect_b32 s26, 1, 0
+; GFX8-NEXT: s_cselect_b32 s27, 1, 0
; GFX8-NEXT: s_add_u32 s24, s24, s22
-; GFX8-NEXT: s_addc_u32 s23, s27, s23
+; GFX8-NEXT: s_addc_u32 s23, s25, s23
; GFX8-NEXT: v_readfirstlane_b32 s29, v5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8
-; GFX8-NEXT: s_mul_i32 s27, s2, s10
+; GFX8-NEXT: s_mul_i32 s25, s2, s10
; GFX8-NEXT: s_cselect_b32 s22, 1, 0
-; GFX8-NEXT: s_add_u32 s24, s27, s24
+; GFX8-NEXT: s_add_u32 s24, s25, s24
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10
-; GFX8-NEXT: s_addc_u32 s27, s28, s23
+; GFX8-NEXT: s_addc_u32 s25, s28, s23
; GFX8-NEXT: s_mul_i32 s28, s3, s9
; GFX8-NEXT: s_cselect_b32 s23, 1, 0
; GFX8-NEXT: s_add_u32 s28, s28, s24
; GFX8-NEXT: v_readfirstlane_b32 s30, v6
; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4
-; GFX8-NEXT: s_addc_u32 s27, s29, s27
+; GFX8-NEXT: s_addc_u32 s25, s29, s25
; GFX8-NEXT: s_mul_i32 s29, s4, s8
; GFX8-NEXT: s_cselect_b32 s24, 1, 0
; GFX8-NEXT: s_add_u32 s28, s29, s28
; GFX8-NEXT: v_readfirstlane_b32 s33, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9
-; GFX8-NEXT: s_addc_u32 s27, s30, s27
+; GFX8-NEXT: s_addc_u32 s29, s30, s25
; GFX8-NEXT: s_mul_i32 s30, s16, s11
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s31, v6
; GFX8-NEXT: s_add_u32 s19, s30, s19
; GFX8-NEXT: s_addc_u32 s28, s31, s28
@@ -1140,84 +1140,84 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_addc_u32 s28, s35, s28
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: s_cselect_b32 s34, 1, 0
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_addc_u32 s19, s25, s19
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_addc_u32 s19, s26, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s13
-; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX8-NEXT: s_addc_u32 s20, s20, 0
-; GFX8-NEXT: v_readfirstlane_b32 s26, v0
+; GFX8-NEXT: v_readfirstlane_b32 s27, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1
-; GFX8-NEXT: s_cmp_lg_u32 s25, 0
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
; GFX8-NEXT: s_addc_u32 s20, s20, s28
-; GFX8-NEXT: s_mul_i32 s25, s16, s14
+; GFX8-NEXT: s_mul_i32 s26, s16, s14
; GFX8-NEXT: s_mul_i32 s28, s1, s13
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX8-NEXT: s_mul_i32 s28, s2, s12
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX8-NEXT: s_mul_i32 s28, s3, s11
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX8-NEXT: s_mul_i32 s28, s4, s10
-; GFX8-NEXT: s_add_u32 s25, s28, s25
+; GFX8-NEXT: s_add_u32 s26, s28, s26
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX8-NEXT: s_mul_i32 s28, s5, s9
-; GFX8-NEXT: s_add_u32 s25, s28, s25
+; GFX8-NEXT: s_add_u32 s26, s28, s26
; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX8-NEXT: v_readfirstlane_b32 s36, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: s_mul_i32 s28, s6, s8
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
-; GFX8-NEXT: s_add_u32 s25, s28, s25
-; GFX8-NEXT: s_addc_u32 s26, s35, s26
+; GFX8-NEXT: s_add_u32 s26, s28, s26
+; GFX8-NEXT: s_addc_u32 s27, s35, s27
; GFX8-NEXT: s_mul_i32 s28, s16, s13
; GFX8-NEXT: v_readfirstlane_b32 s35, v2
-; GFX8-NEXT: s_add_u32 s27, s28, s27
+; GFX8-NEXT: s_add_u32 s28, s28, s29
; GFX8-NEXT: v_readfirstlane_b32 s37, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10
-; GFX8-NEXT: s_addc_u32 s25, s35, s25
+; GFX8-NEXT: s_addc_u32 s26, s35, s26
; GFX8-NEXT: s_mul_i32 s35, s1, s12
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s35, s27
-; GFX8-NEXT: s_addc_u32 s25, s36, s25
+; GFX8-NEXT: s_cselect_b32 s29, 1, 0
+; GFX8-NEXT: s_add_u32 s28, s35, s28
+; GFX8-NEXT: s_addc_u32 s26, s36, s26
; GFX8-NEXT: s_mul_i32 s36, s2, s11
; GFX8-NEXT: s_cselect_b32 s35, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s36, s27
+; GFX8-NEXT: s_add_u32 s28, s36, s28
; GFX8-NEXT: v_readfirstlane_b32 s38, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9
-; GFX8-NEXT: s_addc_u32 s25, s37, s25
+; GFX8-NEXT: s_addc_u32 s26, s37, s26
; GFX8-NEXT: s_mul_i32 s37, s3, s10
; GFX8-NEXT: s_cselect_b32 s36, 1, 0
-; GFX8-NEXT: s_add_u32 s27, s37, s27
+; GFX8-NEXT: s_add_u32 s28, s37, s28
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8
-; GFX8-NEXT: s_addc_u32 s25, s38, s25
+; GFX8-NEXT: s_addc_u32 s26, s38, s26
; GFX8-NEXT: s_mul_i32 s38, s4, s9
; GFX8-NEXT: s_cselect_b32 s37, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s39, v1
-; GFX8-NEXT: s_add_u32 s27, s38, s27
-; GFX8-NEXT: s_addc_u32 s25, s39, s25
+; GFX8-NEXT: s_add_u32 s28, s38, s28
+; GFX8-NEXT: s_addc_u32 s26, s39, s26
; GFX8-NEXT: s_mul_i32 s39, s5, s8
; GFX8-NEXT: s_cselect_b32 s38, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s40, v0
-; GFX8-NEXT: s_add_u32 s27, s39, s27
-; GFX8-NEXT: s_addc_u32 s25, s40, s25
+; GFX8-NEXT: s_add_u32 s28, s39, s28
+; GFX8-NEXT: s_addc_u32 s26, s40, s26
; GFX8-NEXT: s_cselect_b32 s39, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s31, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
@@ -1226,18 +1226,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_cmp_lg_u32 s34, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_addc_u32 s21, s30, s27
-; GFX8-NEXT: s_cselect_b32 s27, 1, 0
+; GFX8-NEXT: s_addc_u32 s21, s30, s28
+; GFX8-NEXT: s_cselect_b32 s28, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_addc_u32 s22, s22, 0
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
-; GFX8-NEXT: s_addc_u32 s22, s22, s25
+; GFX8-NEXT: s_cmp_lg_u32 s28, 0
+; GFX8-NEXT: s_addc_u32 s22, s22, s26
; GFX8-NEXT: s_mul_i32 s16, s16, s15
-; GFX8-NEXT: s_addc_u32 s15, s26, s16
+; GFX8-NEXT: s_addc_u32 s15, s27, s16
; GFX8-NEXT: s_mul_i32 s1, s1, s14
; GFX8-NEXT: s_cmp_lg_u32 s39, 0
; GFX8-NEXT: s_addc_u32 s1, s15, s1
@@ -1254,7 +1254,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_cmp_lg_u32 s35, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_mul_i32 s6, s6, s9
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
+; GFX8-NEXT: s_cmp_lg_u32 s29, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s6
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s0, s0, s8
@@ -1286,15 +1286,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
; GFX9-NEXT: s_add_u32 s17, s22, s17
-; GFX9-NEXT: s_addc_u32 s18, s23, s18
-; GFX9-NEXT: s_mul_i32 s23, s1, s8
-; GFX9-NEXT: s_cselect_b32 s22, 1, 0
+; GFX9-NEXT: s_addc_u32 s22, s23, s18
+; GFX9-NEXT: s_mul_i32 s18, s1, s8
+; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8
-; GFX9-NEXT: s_add_u32 s17, s23, s17
-; GFX9-NEXT: s_addc_u32 s18, s24, s18
+; GFX9-NEXT: s_add_u32 s18, s18, s17
+; GFX9-NEXT: s_addc_u32 s17, s24, s22
; GFX9-NEXT: s_mul_i32 s24, s16, s12
; GFX9-NEXT: s_mul_i32 s26, s1, s11
-; GFX9-NEXT: s_cselect_b32 s23, 1, 0
+; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12
; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11
; GFX9-NEXT: s_add_u32 s24, s26, s24
@@ -1335,8 +1335,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_add_u32 s19, s34, s19
; GFX9-NEXT: s_addc_u32 s24, s35, s24
; GFX9-NEXT: s_cselect_b32 s34, 1, 0
-; GFX9-NEXT: s_cmp_lg_u32 s23, 0
-; GFX9-NEXT: s_addc_u32 s19, s22, s19
+; GFX9-NEXT: s_cmp_lg_u32 s22, 0
+; GFX9-NEXT: s_addc_u32 s19, s23, s19
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_addc_u32 s20, s20, 0
@@ -1439,8 +1439,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s0, s0, s8
; GFX9-NEXT: s_add_u32 s7, s7, s1
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s2, s18
+; GFX9-NEXT: s_mov_b32 s1, s18
+; GFX9-NEXT: s_mov_b32 s2, s17
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
index b76fbdd9bed15e5..e47e7d63fc9453d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
@@ -50,7 +50,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s16) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -71,7 +75,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -92,7 +99,12 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -180,7 +192,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s16) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_ANYEXT %1
@@ -198,7 +214,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ANYEXT %1
@@ -216,7 +235,12 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_ANYEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
index d87bc1f01bdb85b..bb2917ac83886c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir
@@ -62,8 +62,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
; CHECK-NEXT: [[FREEZE:%[0-9]+]]:sgpr(s1) = G_FREEZE [[TRUNC]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[FREEZE]](s1)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0(s32)
%2:_(s1) = G_FREEZE %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
index 9e56cb85bf409a0..17c6609e2de2ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
@@ -69,7 +69,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s16) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -90,7 +94,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -111,7 +118,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -199,7 +210,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s16) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_SEXT %1
@@ -217,7 +232,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_SEXT %1
@@ -235,7 +253,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_SEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
index 8756061d89ca224..d486958c8e9b31d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
@@ -68,7 +68,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s16) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -89,7 +93,10 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -110,7 +117,12 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s64) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@@ -198,7 +210,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s16) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s16) = G_ZEXT %1
@@ -216,7 +232,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ZEXT %1
@@ -234,7 +253,12 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s64) = G_ZEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ANYEXT]](s32), [[C1]], [[C]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C2]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s64) = G_ZEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index cded5c94edf8cc3..7bed94ff8ff4dce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1081,7 +1081,6 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_xor_b32 s0, s1, s0
; GFX8-NEXT: s_ashr_i32 s1, s3, 23
; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s2
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4248f7b6a158312..bef24b766b4c3e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -335,7 +335,6 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
-; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d0c55c69f508775..174963c3d6e56e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -327,7 +327,6 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
-; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 65455d754be4f53..0177f00fbb64b4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1081,7 +1081,6 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_xor_b32 s0, s1, s0
; GFX8-NEXT: s_ashr_i32 s1, s3, 23
; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s2
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index f8e7e5ecd6260fb..e05531dac57aedb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -640,7 +640,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-NEXT: s_cmp_gt_i32 s1, 0
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s1, s0
-; GFX7-NEXT: s_and_b32 s0, s0, 1
; GFX7-NEXT: s_sub_i32 s0, s2, s0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -652,7 +651,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8-NEXT: s_cmp_gt_i32 s1, 0
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_sub_i32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -664,7 +662,6 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9-NEXT: s_cmp_gt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s1, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_sub_i32 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -749,8 +746,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s2, s0
; GFX7-NEXT: s_xor_b32 s1, s3, s1
-; GFX7-NEXT: s_and_b32 s0, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_sub_i32 s0, s4, s0
; GFX7-NEXT: s_sub_i32 s1, s5, s1
; GFX7-NEXT: ; return to shader part epilog
@@ -769,8 +764,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s2, s0
; GFX8-NEXT: s_xor_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_sub_i32 s0, s4, s0
; GFX8-NEXT: s_sub_i32 s1, s5, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -789,8 +782,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s2, s0
; GFX9-NEXT: s_xor_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_sub_i32 s0, s4, s0
; GFX9-NEXT: s_sub_i32 s1, s5, s1
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 77737b356ff6e9c..c14a7ed5b8c8db3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -323,7 +323,6 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s4, 1
-; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 097f6642cbc669b..6f7ebc8f300c170 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -319,7 +319,6 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s4, 1
-; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
index e996fda4c9fd6ca..7c481310de64859 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
@@ -19,9 +19,8 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -51,9 +50,8 @@ define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -83,9 +81,8 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -115,9 +112,8 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -147,9 +143,8 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -179,9 +174,8 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -211,9 +205,8 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -243,9 +236,8 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f32 s2, 2.0
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -275,9 +267,8 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -307,9 +298,8 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -339,9 +329,8 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -371,9 +360,8 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -403,9 +391,8 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -435,9 +422,8 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -467,9 +453,8 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg %
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -499,9 +484,8 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg %
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f16 s2, 0x4000
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c94693c4..dd60749db53f85f 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -252,7 +252,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
@@ -381,7 +380,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
@@ -525,8 +523,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 671ead6127308dd..a33388b747d6498 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -34,8 +34,7 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3
; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00
-; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0
-; GFX7GLISEL-NEXT: s_bfe_i32 s3, s3, 0x10000
+; GFX7GLISEL-NEXT: s_cselect_b32 s3, -1, 0
; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
index 19e50be155a9646..f8424b89eb7a626 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
@@ -19,9 +19,8 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -51,9 +50,8 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_eq_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -83,9 +81,8 @@ define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -115,9 +112,8 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -147,9 +143,8 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lg_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -179,9 +174,8 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -211,9 +205,8 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_o_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -243,9 +236,8 @@ define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_u_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -275,9 +267,8 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -307,9 +298,8 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlg_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -339,9 +329,8 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -371,9 +360,8 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -403,9 +391,8 @@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_neq_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -435,9 +422,8 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f32 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -467,9 +453,8 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -499,9 +484,8 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_eq_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -531,9 +515,8 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_le_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -563,9 +546,8 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_gt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -595,9 +577,8 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_lg_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -627,9 +608,8 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ge_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -659,9 +639,8 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_o_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -691,9 +670,8 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_u_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -723,9 +701,8 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nge_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -755,9 +732,8 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlg_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -787,9 +763,8 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_ngt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -819,9 +794,8 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nle_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -851,9 +825,8 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_neq_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
@@ -883,9 +856,8 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_cmp_nlt_f16 s2, s3
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: s_cselect_b32 s2, -1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GISEL-NEXT: s_nop 0
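
A note for readers skimming the autogenerated test deltas above: the recurring change folds a two-instruction sequence into one. `s_bfe_i32 sN, sN, 0x10000` is a signed bit-field extract with offset 0 and width 1 (the immediate packs the offset into bits [5:0] and the width into bits [22:16]), i.e. it sign-extends bit 0. Applied to the result of `s_cselect_b32 sN, 1, 0` it therefore produces exactly `s_cselect_b32 sN, -1, 0`, and on GFX11 the `s_delay_alu` shrinks because one SALU-to-SALU dependency disappears with the dropped instruction. Likewise, the `s_and_b32 s6, s6, 1` removed in fptrunc.ll masked a value that is already 0 or 1, so it was a no-op. A minimal C++ sketch of the main equivalence (the helper name is illustrative, not an LLVM or AMDGPU API):

```cpp
#include <cassert>
#include <cstdint>

// Models s_bfe_i32 dst, src, 0x10000: signed bit-field extract of width 1 at
// offset 0, i.e. sign-extend bit 0 of src (1 -> -1, 0 -> 0).
static int32_t sBfeI32Width1Offset0(int32_t Src) {
  return -(Src & 1);
}

int main() {
  for (bool SCC : {false, true}) {
    // Old sequence: s_cselect_b32 s2, 1, 0 ; s_bfe_i32 s2, s2, 0x10000
    int32_t Old = sBfeI32Width1Offset0(SCC ? 1 : 0);
    // New sequence: s_cselect_b32 s2, -1, 0
    int32_t New = SCC ? -1 : 0;
    assert(Old == New); // both materialize a sign-extended i1 from SCC
  }
  return 0;
}
```

This is also why the sign-extended G_SELECT lowering costs nothing extra: the select of -1/0 on SCC is a single `s_cselect_b32`, the same instruction the old 1/0 select already used.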