[llvm] b3bb5c3 - [AMDGPU][GlobalISel] Use scalar min/max instructions

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 4 09:04:41 PST 2021


Author: Jay Foad
Date: 2021-02-04T17:04:32Z
New Revision: b3bb5c3efc971f595a08446f3e58c0fd4162c26d

URL: https://github.com/llvm/llvm-project/commit/b3bb5c3efc971f595a08446f3e58c0fd4162c26d
DIFF: https://github.com/llvm/llvm-project/commit/b3bb5c3efc971f595a08446f3e58c0fd4162c26d.diff

LOG: [AMDGPU][GlobalISel] Use scalar min/max instructions

SALU min/max s32 instructions exist so use them. This means that
regbankselect can handle min/max much like add/sub/mul/shifts.

Differential Revision: https://reviews.llvm.org/D96047

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 502356d4f9a4..408c8d96439e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -591,21 +591,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
 
     return AltMappings;
   }
-  case TargetOpcode::G_SMIN:
-  case TargetOpcode::G_SMAX:
-  case TargetOpcode::G_UMIN:
-  case TargetOpcode::G_UMAX: {
-    static const OpRegBankEntry<3> Table[2] = {
-      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
-
-      // Scalar requires cmp+select, and extends if 16-bit.
-      // FIXME: Should there be separate costs for 32 and 16-bit
-      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
-    };
-
-    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
-    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
-  }
   case TargetOpcode::G_UADDE:
   case TargetOpcode::G_USUBE:
   case TargetOpcode::G_SADDE:
@@ -1576,23 +1561,8 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
   return true;
 }
 
-// FIXME: Duplicated from LegalizerHelper
-static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
-  switch (Opc) {
-  case TargetOpcode::G_SMIN:
-    return CmpInst::ICMP_SLT;
-  case TargetOpcode::G_SMAX:
-    return CmpInst::ICMP_SGT;
-  case TargetOpcode::G_UMIN:
-    return CmpInst::ICMP_ULT;
-  case TargetOpcode::G_UMAX:
-    return CmpInst::ICMP_UGT;
-  default:
-    llvm_unreachable("not in integer min/max");
-  }
-}
-
-static unsigned minMaxToExtend(unsigned Opc) {
+// Return a suitable opcode for extending the operands of Opc when widening.
+static unsigned getExtendOp(unsigned Opc) {
   switch (Opc) {
   case TargetOpcode::G_SMIN:
   case TargetOpcode::G_SMAX:
@@ -1601,7 +1571,7 @@ static unsigned minMaxToExtend(unsigned Opc) {
   case TargetOpcode::G_UMAX:
     return TargetOpcode::G_ZEXT;
   default:
-    llvm_unreachable("not in integer min/max");
+    return TargetOpcode::G_ANYEXT;
   }
 }
 
@@ -1628,30 +1598,6 @@ unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
 }
 
-static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
-                                               CmpInst::Predicate Pred,
-                                               Register Dst, Register Src0,
-                                               Register Src1) {
-  const LLT CmpType = LLT::scalar(32);
-  auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
-  return B.buildSelect(Dst, Cmp, Src0, Src1);
-}
-
-// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
-void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
-                                               MachineInstr &MI) const {
-  Register Dst = MI.getOperand(0).getReg();
-  Register Src0 = MI.getOperand(1).getReg();
-  Register Src1 = MI.getOperand(2).getReg();
-
-  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
-  MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
-
-  Register CmpReg = Sel->getOperand(1).getReg();
-  B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
-  MI.eraseFromParent();
-}
-
 // For cases where only a single copy is inserted for matching register banks.
 // Replace the register in the instruction operand
 static bool substituteSimpleCopyRegs(
@@ -2341,7 +2287,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   case AMDGPU::G_MUL:
   case AMDGPU::G_SHL:
   case AMDGPU::G_LSHR:
-  case AMDGPU::G_ASHR: {
+  case AMDGPU::G_ASHR:
+  case AMDGPU::G_SMIN:
+  case AMDGPU::G_SMAX:
+  case AMDGPU::G_UMIN:
+  case AMDGPU::G_UMAX: {
     Register DstReg = MI.getOperand(0).getReg();
     LLT DstTy = MRI.getType(DstReg);
 
@@ -2365,10 +2315,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       Register WideSrc0Lo, WideSrc0Hi;
       Register WideSrc1Lo, WideSrc1Hi;
 
+      unsigned ExtendOp = getExtendOp(MI.getOpcode());
       std::tie(WideSrc0Lo, WideSrc0Hi)
-        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
+        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
       std::tie(WideSrc1Lo, WideSrc1Hi)
-        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
+        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
@@ -2390,73 +2341,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 
     return;
   }
-  case AMDGPU::G_SMIN:
-  case AMDGPU::G_SMAX:
-  case AMDGPU::G_UMIN:
-  case AMDGPU::G_UMAX: {
-    Register DstReg = MI.getOperand(0).getReg();
-    const RegisterBank *DstBank =
-      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-    if (DstBank == &AMDGPU::VGPRRegBank)
-      break;
-
-    MachineFunction *MF = MI.getParent()->getParent();
-    MachineIRBuilder B(MI);
-
-    // Turn scalar min/max into a compare and select.
-    LLT Ty = MRI.getType(DstReg);
-    const LLT S32 = LLT::scalar(32);
-    const LLT S16 = LLT::scalar(16);
-    const LLT V2S16 = LLT::vector(2, 16);
-
-    if (Ty == V2S16) {
-      ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
-      B.setChangeObserver(ApplySALU);
-
-      // Need to widen to s32, and expand as cmp + select, and avoid producing
-      // illegal vector extends or unmerges that would need further
-      // legalization.
-      //
-      // TODO: Should we just readfirstlane? That should probably be handled
-      // with a UniformVGPR register bank that wouldn't need special
-      // consideration here.
-
-      Register Dst = MI.getOperand(0).getReg();
-      Register Src0 = MI.getOperand(1).getReg();
-      Register Src1 = MI.getOperand(2).getReg();
-
-      Register WideSrc0Lo, WideSrc0Hi;
-      Register WideSrc1Lo, WideSrc1Hi;
-
-      unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
-
-      std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
-      std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
-
-      Register Lo = MRI.createGenericVirtualRegister(S32);
-      Register Hi = MRI.createGenericVirtualRegister(S32);
-      const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
-      buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
-      buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
-
-      B.buildBuildVectorTrunc(Dst, {Lo, Hi});
-      MI.eraseFromParent();
-    } else if (Ty == S16) {
-      ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
-      B.setChangeObserver(ApplySALU);
-      LegalizerHelper Helper(*MF, ApplySALU, B);
-
-      // Need to widen to s32, and expand as cmp + select.
-      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
-        llvm_unreachable("widenScalar should have succeeded");
-
-      // FIXME: This is relying on widenScalar leaving MI in place.
-      lowerScalarMinMax(B, MI);
-    } else
-      lowerScalarMinMax(B, MI);
-
-    return;
-  }
   case AMDGPU::G_SEXT_INREG: {
     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
     if (SrcRegs.empty())

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 1c1441729e30..c481aadeb226 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -84,8 +84,6 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
   bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
                                 bool Signed) const;
 
-  void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
-
   Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           Register Reg) const;
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
index c12d209d9bda..d932e66ca0f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
@@ -13,8 +13,7 @@ body: |
     ; CHECK-LABEL: name: smax_s32_ss
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
+    ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[COPY1]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_SMAX %0, %1
@@ -90,9 +89,8 @@ body: |
     ; CHECK-LABEL: name: smax_s32_ss_vgpr_use
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $vgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[COPY1]]
+    ; CHECK: $vgpr0 = COPY [[SMAX]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_SMAX %0, %1
@@ -114,9 +112,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
     ; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT]](s32), [[SEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
+    ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT]], [[SEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMAX]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -144,9 +141,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
     ; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT]](s32), [[SEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
+    ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT]], [[SEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMAX]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -178,11 +174,9 @@ body: |
     ; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
     ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT_INREG]](s32), [[SEXT_INREG1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[ASHR]](s32), [[ASHR1]]
-    ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[ASHR]], [[ASHR1]]
-    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
+    ; CHECK: [[SMAX1:%[0-9]+]]:sgpr(s32) = G_SMAX [[ASHR]], [[ASHR1]]
+    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMAX]](s32), [[SMAX1]](s32)
     ; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
index da19ddcb86df..1efac8980b5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s  | FileCheck %s
-# XUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s  | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s  | FileCheck %s
 
 ---
 name: smin_s32_ss
@@ -13,9 +13,8 @@ body: |
     ; CHECK-LABEL: name: smin_s32_ss
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $sgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[COPY]], [[COPY1]]
+    ; CHECK: $sgpr0 = COPY [[SMIN]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_SMIN %0, %1
@@ -93,9 +92,8 @@ body: |
     ; CHECK-LABEL: name: smin_s32_ss_vgpr_use
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $vgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[COPY]], [[COPY1]]
+    ; CHECK: $vgpr0 = COPY [[SMIN]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_SMIN %0, %1
@@ -117,9 +115,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
     ; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT]](s32), [[SEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
+    ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT]], [[SEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMIN]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -147,9 +144,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
     ; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT]](s32), [[SEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
+    ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT]], [[SEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMIN]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -181,11 +177,9 @@ body: |
     ; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
     ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[ASHR]](s32), [[ASHR1]]
-    ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[ASHR]], [[ASHR1]]
-    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
+    ; CHECK: [[SMIN1:%[0-9]+]]:sgpr(s32) = G_SMIN [[ASHR]], [[ASHR1]]
+    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMIN]](s32), [[SMIN1]](s32)
     ; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
index 3f464f890d04..5b4cc72990c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
@@ -13,9 +13,8 @@ body: |
     ; CHECK-LABEL: name: umax_s32_ss
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $sgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[COPY1]]
+    ; CHECK: $sgpr0 = COPY [[UMAX]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_UMAX %0, %1
@@ -93,9 +92,8 @@ body: |
     ; CHECK-LABEL: name: umax_s32_ss_vgpr_use
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $vgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[COPY1]]
+    ; CHECK: $vgpr0 = COPY [[UMAX]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_UMAX %0, %1
@@ -117,9 +115,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
     ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[ZEXT]](s32), [[ZEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMAX]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -147,9 +144,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
     ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[ZEXT]](s32), [[ZEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMAX]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -183,11 +179,9 @@ body: |
     ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
     ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
     ; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[AND]](s32), [[AND1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[AND]], [[AND1]]
-    ; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[LSHR]](s32), [[LSHR1]]
-    ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[LSHR]], [[LSHR1]]
-    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[AND]], [[AND1]]
+    ; CHECK: [[UMAX1:%[0-9]+]]:sgpr(s32) = G_UMAX [[LSHR]], [[LSHR1]]
+    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMAX]](s32), [[UMAX1]](s32)
     ; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
index 35072751a069..7e14b1fb6615 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
@@ -13,9 +13,8 @@ body: |
     ; CHECK-LABEL: name: umin_s32_ss
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $sgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[COPY]], [[COPY1]]
+    ; CHECK: $sgpr0 = COPY [[UMIN]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_UMIN %0, %1
@@ -97,9 +96,8 @@ body: |
     ; CHECK-LABEL: name: umin_s32_ss_vgpr_use
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
-    ; CHECK: $vgpr0 = COPY [[SELECT]](s32)
+    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[COPY]], [[COPY1]]
+    ; CHECK: $vgpr0 = COPY [[UMIN]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_UMIN %0, %1
@@ -121,9 +119,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
     ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[ZEXT]](s32), [[ZEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMIN]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -151,9 +148,8 @@ body: |
     ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
     ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[ZEXT]](s32), [[ZEXT1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[ZEXT]], [[ZEXT1]]
+    ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMIN]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
     ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $sgpr0
@@ -187,11 +183,9 @@ body: |
     ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
     ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
     ; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
-    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND1]]
-    ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[AND]], [[AND1]]
-    ; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[LSHR]](s32), [[LSHR1]]
-    ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[LSHR]], [[LSHR1]]
-    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[AND]], [[AND1]]
+    ; CHECK: [[UMIN1:%[0-9]+]]:sgpr(s32) = G_UMIN [[LSHR]], [[LSHR1]]
+    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMIN]](s32), [[UMIN1]](s32)
     ; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 8bb6c12c0c7d..a4e7e30b42b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -61,17 +61,13 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i7:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    s_max_i32 s1, s3, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -80,23 +76,19 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_bfe_u32 s2, 9, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s4, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s5, s3, s4
-; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_max_i32 s5, s3, s4
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
@@ -183,17 +175,13 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    s_max_i32 s1, s3, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -202,23 +190,19 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s4, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s5, s3, s4
-; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_max_i32 s5, s3, s4
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
+; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
@@ -360,38 +344,30 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-LABEL: s_saddsat_v2i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    s_min_i32 s7, s0, 0
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, 0
+; GFX6-NEXT:    s_max_i32 s6, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s7, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s6
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s6
+; GFX6-NEXT:    s_max_i32 s1, s7, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s6
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s1, 0
+; GFX6-NEXT:    s_max_i32 s3, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s3, s4, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s1, 0
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX6-NEXT:    s_max_i32 s2, s4, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
+; GFX6-NEXT:    s_movk_i32 s2, 0xff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
@@ -403,50 +379,42 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s8, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s8
-; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s9, s7, s8
-; GFX8-NEXT:    s_sub_i32 s9, s5, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s7, s8
+; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s7, s7, s8
+; GFX8-NEXT:    s_min_i32 s7, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s7, s6, s7
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s7, s1
+; GFX8-NEXT:    s_max_i32 s1, s7, s1
+; GFX8-NEXT:    s_sub_i32 s9, s5, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8-NEXT:    s_min_i32 s1, s1, s7
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s7, s3, s8
-; GFX8-NEXT:    s_sub_i32 s5, s5, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX8-NEXT:    s_max_i32 s7, s3, s8
+; GFX8-NEXT:    s_min_i32 s3, s3, s8
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_max_i32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s5, s5, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
+; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
@@ -714,68 +682,52 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX6-NEXT:    s_brev_b32 s9, 1
+; GFX6-NEXT:    s_min_i32 s11, s0, 0
 ; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, 0
+; GFX6-NEXT:    s_max_i32 s10, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s11, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s11, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s10
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s10
+; GFX6-NEXT:    s_max_i32 s1, s11, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s10
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX6-NEXT:    s_min_i32 s10, s1, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s1, 0
+; GFX6-NEXT:    s_max_i32 s5, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s5
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s2, s10, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s5
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX6-NEXT:    s_min_i32 s6, s2, 0
+; GFX6-NEXT:    s_max_i32 s5, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s6, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s3, s6, s3
+; GFX6-NEXT:    s_min_i32 s3, s3, s5
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX6-NEXT:    s_min_i32 s6, s3, 0
+; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s4, s6, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -795,91 +747,75 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
+; GFX8-NEXT:    s_sext_i32_i16 s11, s0
+; GFX8-NEXT:    s_sext_i32_i16 s12, 0
+; GFX8-NEXT:    s_max_i32 s13, s11, s12
+; GFX8-NEXT:    s_movk_i32 s10, 0x8000
+; GFX8-NEXT:    s_min_i32 s11, s11, s12
+; GFX8-NEXT:    s_sub_i32 s11, s10, s11
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
-; GFX8-NEXT:    s_sext_i32_i16 s11, s0
-; GFX8-NEXT:    s_sext_i32_i16 s12, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s11, s12
 ; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s13, s11, s12
-; GFX8-NEXT:    s_sub_i32 s13, s9, s13
-; GFX8-NEXT:    s_cmp_lt_i32 s11, s12
-; GFX8-NEXT:    s_movk_i32 s10, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s11, s11, s12
-; GFX8-NEXT:    s_sub_i32 s11, s10, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s11, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s11, s1
+; GFX8-NEXT:    s_max_i32 s1, s11, s1
+; GFX8-NEXT:    s_sub_i32 s13, s9, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s13
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s11
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s11
+; GFX8-NEXT:    s_min_i32 s1, s1, s11
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s11, s5, s12
-; GFX8-NEXT:    s_sub_i32 s11, s9, s11
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX8-NEXT:    s_max_i32 s11, s5, s12
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX8-NEXT:    s_max_i32 s2, s5, s2
+; GFX8-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s11
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s5
+; GFX8-NEXT:    s_min_i32 s2, s2, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
-; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
-; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s6, s5, s12
-; GFX8-NEXT:    s_sub_i32 s6, s9, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
+; GFX8-NEXT:    s_max_i32 s6, s5, s12
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX8-NEXT:    s_max_i32 s3, s5, s3
+; GFX8-NEXT:    s_sub_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX8-NEXT:    s_min_i32 s3, s3, s5
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
-; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s6, s5, s12
-; GFX8-NEXT:    s_sub_i32 s6, s9, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX8-NEXT:    s_max_i32 s6, s5, s12
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX8-NEXT:    s_max_i32 s4, s5, s4
+; GFX8-NEXT:    s_sub_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s3, s3, s4
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
 ; GFX8-NEXT:    s_and_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -1046,17 +982,13 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i24:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    s_max_i32 s1, s3, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1159,31 +1091,23 @@ define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GCN-NEXT:    ; return to shader part epilog
 ; GFX6-LABEL: s_saddsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s0, 0
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    s_max_i32 s1, s3, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX8-NEXT:    s_cselect_b32 s3, s0, 0
+; GFX8-NEXT:    s_min_i32 s3, s0, 0
+; GFX8-NEXT:    s_max_i32 s2, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1206,12 +1130,10 @@ define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX6-LABEL: saddsat_i32_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s1, s0, 0
-; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
+; GFX6-NEXT:    s_min_i32 s2, s0, 0
+; GFX6-NEXT:    s_max_i32 s1, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
+; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
 ; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -1219,12 +1141,10 @@ define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ;
 ; GFX8-LABEL: saddsat_i32_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_cselect_b32 s1, s0, 0
-; GFX8-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, s0, 0
+; GFX8-NEXT:    s_min_i32 s2, s0, 0
+; GFX8-NEXT:    s_max_i32 s1, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s2, 0x80000000, s2
+; GFX8-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
 ; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
 ; GFX8-NEXT:    v_min_i32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -1345,57 +1265,41 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, 0
-; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, 0
+; GFX6-NEXT:    s_min_i32 s7, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s6
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX6-NEXT:    s_brev_b32 s4, -2
+; GFX6-NEXT:    s_max_i32 s6, s0, 0
+; GFX6-NEXT:    s_sub_i32 s6, s4, s6
+; GFX6-NEXT:    s_max_i32 s2, s7, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s6
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s1, 0
+; GFX6-NEXT:    s_max_i32 s2, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s2, s4, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s1, 0
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_max_i32 s3, s4, s3
+; GFX6-NEXT:    s_min_i32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    s_cselect_b32 s6, s0, 0
-; GFX8-NEXT:    s_sub_i32 s6, s4, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cselect_b32 s7, s0, 0
+; GFX8-NEXT:    s_min_i32 s7, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s7, s5, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s7, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s6
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    s_max_i32 s6, s0, 0
+; GFX8-NEXT:    s_sub_i32 s6, s4, s6
+; GFX8-NEXT:    s_max_i32 s2, s7, s2
+; GFX8-NEXT:    s_min_i32 s2, s2, s6
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s2, s1, 0
+; GFX8-NEXT:    s_max_i32 s2, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s2, s4, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s4, s1, 0
+; GFX8-NEXT:    s_min_i32 s4, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s4, s5, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_max_i32 s3, s4, s3
+; GFX8-NEXT:    s_min_i32 s2, s3, s2
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1500,79 +1404,55 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s6, -2
-; GFX6-NEXT:    s_cselect_b32 s8, s0, 0
-; GFX6-NEXT:    s_sub_i32 s8, s6, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s7, 1
-; GFX6-NEXT:    s_cselect_b32 s9, s0, 0
+; GFX6-NEXT:    s_min_i32 s9, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s9, s7, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s9, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX6-NEXT:    s_brev_b32 s6, -2
+; GFX6-NEXT:    s_max_i32 s8, s0, 0
+; GFX6-NEXT:    s_sub_i32 s8, s6, s8
+; GFX6-NEXT:    s_max_i32 s3, s9, s3
+; GFX6-NEXT:    s_min_i32 s3, s3, s8
+; GFX6-NEXT:    s_min_i32 s8, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s1, 0
-; GFX6-NEXT:    s_sub_i32 s3, s6, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s8, s1, 0
+; GFX6-NEXT:    s_max_i32 s3, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s7, s8
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX6-NEXT:    s_add_i32 s1, s1, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s3, s6, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s2, 0
+; GFX6-NEXT:    s_max_i32 s4, s8, s4
+; GFX6-NEXT:    s_min_i32 s3, s4, s3
+; GFX6-NEXT:    s_min_i32 s4, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s7, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX6-NEXT:    s_add_i32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s3, s2, 0
+; GFX6-NEXT:    s_sub_i32 s3, s6, s3
+; GFX6-NEXT:    s_max_i32 s4, s4, s5
+; GFX6-NEXT:    s_min_i32 s3, s4, s3
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_brev_b32 s6, -2
-; GFX8-NEXT:    s_cselect_b32 s8, s0, 0
-; GFX8-NEXT:    s_sub_i32 s8, s6, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX8-NEXT:    s_brev_b32 s7, 1
-; GFX8-NEXT:    s_cselect_b32 s9, s0, 0
+; GFX8-NEXT:    s_min_i32 s9, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s9, s7, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s9, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s9, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX8-NEXT:    s_brev_b32 s6, -2
+; GFX8-NEXT:    s_max_i32 s8, s0, 0
+; GFX8-NEXT:    s_sub_i32 s8, s6, s8
+; GFX8-NEXT:    s_max_i32 s3, s9, s3
+; GFX8-NEXT:    s_min_i32 s3, s3, s8
+; GFX8-NEXT:    s_min_i32 s8, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s3, s1, 0
-; GFX8-NEXT:    s_sub_i32 s3, s6, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s8, s1, 0
+; GFX8-NEXT:    s_max_i32 s3, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s8, s7, s8
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX8-NEXT:    s_add_i32 s1, s1, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s3, s2, 0
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s4, s2, 0
+; GFX8-NEXT:    s_max_i32 s4, s8, s4
+; GFX8-NEXT:    s_min_i32 s3, s4, s3
+; GFX8-NEXT:    s_min_i32 s4, s2, 0
 ; GFX8-NEXT:    s_sub_i32 s4, s7, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_add_i32 s1, s1, s3
+; GFX8-NEXT:    s_max_i32 s3, s2, 0
+; GFX8-NEXT:    s_sub_i32 s3, s6, s3
+; GFX8-NEXT:    s_max_i32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s3, s4, s3
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1698,101 +1578,69 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, 0
-; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, 0
+; GFX6-NEXT:    s_min_i32 s11, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s11, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX6-NEXT:    s_brev_b32 s8, -2
+; GFX6-NEXT:    s_max_i32 s10, s0, 0
+; GFX6-NEXT:    s_sub_i32 s10, s8, s10
+; GFX6-NEXT:    s_max_i32 s4, s11, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_min_i32 s10, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s1, 0
-; GFX6-NEXT:    s_sub_i32 s4, s8, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s1, 0
+; GFX6-NEXT:    s_max_i32 s4, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s10, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s4, s8, s4
+; GFX6-NEXT:    s_max_i32 s5, s10, s5
+; GFX6-NEXT:    s_min_i32 s4, s5, s4
+; GFX6-NEXT:    s_min_i32 s5, s2, 0
+; GFX6-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX6-NEXT:    s_add_i32 s1, s1, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s2, 0
+; GFX6-NEXT:    s_max_i32 s4, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s8, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s2, 0
+; GFX6-NEXT:    s_max_i32 s5, s5, s6
+; GFX6-NEXT:    s_min_i32 s4, s5, s4
+; GFX6-NEXT:    s_min_i32 s5, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s5, s9, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s3, 0
+; GFX6-NEXT:    s_max_i32 s4, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s8, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s5, s9, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_max_i32 s5, s5, s7
+; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_cselect_b32 s10, s0, 0
-; GFX8-NEXT:    s_sub_i32 s10, s8, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX8-NEXT:    s_brev_b32 s9, 1
-; GFX8-NEXT:    s_cselect_b32 s11, s0, 0
+; GFX8-NEXT:    s_min_i32 s11, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s11, s9, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s11, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    s_brev_b32 s8, -2
+; GFX8-NEXT:    s_max_i32 s10, s0, 0
+; GFX8-NEXT:    s_sub_i32 s10, s8, s10
+; GFX8-NEXT:    s_max_i32 s4, s11, s4
+; GFX8-NEXT:    s_min_i32 s4, s4, s10
+; GFX8-NEXT:    s_min_i32 s10, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s4, s1, 0
-; GFX8-NEXT:    s_sub_i32 s4, s8, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s10, s1, 0
+; GFX8-NEXT:    s_max_i32 s4, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s10, s9, s10
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s10, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX8-NEXT:    s_sub_i32 s4, s8, s4
+; GFX8-NEXT:    s_max_i32 s5, s10, s5
+; GFX8-NEXT:    s_min_i32 s4, s5, s4
+; GFX8-NEXT:    s_min_i32 s5, s2, 0
+; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s4, s2, 0
+; GFX8-NEXT:    s_max_i32 s4, s2, 0
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s2, 0
+; GFX8-NEXT:    s_max_i32 s5, s5, s6
+; GFX8-NEXT:    s_min_i32 s4, s5, s4
+; GFX8-NEXT:    s_min_i32 s5, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s5, s9, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s4, s3, 0
+; GFX8-NEXT:    s_max_i32 s4, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s3, 0
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX8-NEXT:    s_max_i32 s5, s5, s7
+; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_add_i32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1943,123 +1791,83 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s10, -2
-; GFX6-NEXT:    s_cselect_b32 s12, s0, 0
-; GFX6-NEXT:    s_sub_i32 s12, s10, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s11, 1
-; GFX6-NEXT:    s_cselect_b32 s13, s0, 0
+; GFX6-NEXT:    s_min_i32 s13, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s13, s11, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s13, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s13, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX6-NEXT:    s_brev_b32 s10, -2
+; GFX6-NEXT:    s_max_i32 s12, s0, 0
+; GFX6-NEXT:    s_sub_i32 s12, s10, s12
+; GFX6-NEXT:    s_max_i32 s5, s13, s5
+; GFX6-NEXT:    s_min_i32 s5, s5, s12
+; GFX6-NEXT:    s_min_i32 s12, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s5, s10, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s12, s1, 0
+; GFX6-NEXT:    s_max_i32 s5, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s12, s11, s12
-; GFX6-NEXT:    s_cmp_gt_i32 s12, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s12, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_sub_i32 s5, s10, s5
+; GFX6-NEXT:    s_max_i32 s6, s12, s6
+; GFX6-NEXT:    s_min_i32 s5, s6, s5
+; GFX6-NEXT:    s_min_i32 s6, s2, 0
+; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_add_i32 s1, s1, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s2, 0
+; GFX6-NEXT:    s_max_i32 s5, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX6-NEXT:    s_max_i32 s6, s6, s7
+; GFX6-NEXT:    s_min_i32 s5, s6, s5
+; GFX6-NEXT:    s_min_i32 s6, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX6-NEXT:    s_add_i32 s2, s2, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s3, 0
+; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s3, 0
+; GFX6-NEXT:    s_max_i32 s6, s6, s8
+; GFX6-NEXT:    s_min_i32 s5, s6, s5
+; GFX6-NEXT:    s_min_i32 s6, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX6-NEXT:    s_add_i32 s3, s3, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s4, 0
+; GFX6-NEXT:    s_max_i32 s5, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s4, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s9
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_max_i32 s6, s6, s9
+; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_add_i32 s4, s4, s5
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_brev_b32 s10, -2
-; GFX8-NEXT:    s_cselect_b32 s12, s0, 0
-; GFX8-NEXT:    s_sub_i32 s12, s10, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX8-NEXT:    s_brev_b32 s11, 1
-; GFX8-NEXT:    s_cselect_b32 s13, s0, 0
+; GFX8-NEXT:    s_min_i32 s13, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s13, s11, s13
-; GFX8-NEXT:    s_cmp_gt_i32 s13, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s13, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX8-NEXT:    s_brev_b32 s10, -2
+; GFX8-NEXT:    s_max_i32 s12, s0, 0
+; GFX8-NEXT:    s_sub_i32 s12, s10, s12
+; GFX8-NEXT:    s_max_i32 s5, s13, s5
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
+; GFX8-NEXT:    s_min_i32 s12, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s1, 0
-; GFX8-NEXT:    s_sub_i32 s5, s10, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s12, s1, 0
+; GFX8-NEXT:    s_max_i32 s5, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s12, s11, s12
-; GFX8-NEXT:    s_cmp_gt_i32 s12, s6
-; GFX8-NEXT:    s_cselect_b32 s6, s12, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX8-NEXT:    s_sub_i32 s5, s10, s5
+; GFX8-NEXT:    s_max_i32 s6, s12, s6
+; GFX8-NEXT:    s_min_i32 s5, s6, s5
+; GFX8-NEXT:    s_min_i32 s6, s2, 0
+; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_add_i32 s1, s1, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s2, 0
+; GFX8-NEXT:    s_max_i32 s5, s2, 0
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX8-NEXT:    s_max_i32 s6, s6, s7
+; GFX8-NEXT:    s_min_i32 s5, s6, s5
+; GFX8-NEXT:    s_min_i32 s6, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s7
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX8-NEXT:    s_add_i32 s2, s2, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s3, 0
+; GFX8-NEXT:    s_max_i32 s5, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s6, s3, 0
+; GFX8-NEXT:    s_max_i32 s6, s6, s8
+; GFX8-NEXT:    s_min_i32 s5, s6, s5
+; GFX8-NEXT:    s_min_i32 s6, s4, 0
 ; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s8
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX8-NEXT:    s_add_i32 s3, s3, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s5, s4, 0
+; GFX8-NEXT:    s_max_i32 s5, s4, 0
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s6, s4, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s9
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX8-NEXT:    s_max_i32 s6, s6, s9
+; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_add_i32 s4, s4, s5
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2391,365 +2199,237 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s32, -2
-; GFX6-NEXT:    s_cselect_b32 s34, s0, 0
-; GFX6-NEXT:    s_sub_i32 s34, s32, s34
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s33, 1
-; GFX6-NEXT:    s_cselect_b32 s35, s0, 0
+; GFX6-NEXT:    s_min_i32 s35, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s35, s33, s35
-; GFX6-NEXT:    s_cmp_gt_i32 s35, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s35, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s34
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s34
+; GFX6-NEXT:    s_brev_b32 s32, -2
+; GFX6-NEXT:    s_max_i32 s34, s0, 0
+; GFX6-NEXT:    s_sub_i32 s34, s32, s34
+; GFX6-NEXT:    s_max_i32 s16, s35, s16
+; GFX6-NEXT:    s_min_i32 s16, s16, s34
+; GFX6-NEXT:    s_min_i32 s34, s1, 0
 ; GFX6-NEXT:    s_add_i32 s0, s0, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s1, 0
-; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s34, s1, 0
+; GFX6-NEXT:    s_max_i32 s16, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s34, s33, s34
-; GFX6-NEXT:    s_cmp_gt_i32 s34, s17
-; GFX6-NEXT:    s_cselect_b32 s17, s34, s17
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
+; GFX6-NEXT:    s_sub_i32 s16, s32, s16
+; GFX6-NEXT:    s_max_i32 s17, s34, s17
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s2, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s1, s1, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s2, 0
+; GFX6-NEXT:    s_max_i32 s16, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s2, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s18
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s18
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s2, s2, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s3, 0
+; GFX6-NEXT:    s_max_i32 s16, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s3, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s19
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s19
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s19
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s3, s3, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s4, 0
+; GFX6-NEXT:    s_max_i32 s16, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s4, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s20
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s20
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s20
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s4, s4, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s5, 0
+; GFX6-NEXT:    s_max_i32 s16, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s5, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s21
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s6, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s21
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s21
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s5, s5, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s6, 0
+; GFX6-NEXT:    s_max_i32 s16, s6, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s6, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s22
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s22
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s22
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s6, s6, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s7, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s7, 0
+; GFX6-NEXT:    s_max_i32 s16, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s7, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s7, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s23
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s8, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s23
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s23
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s7, s7, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s8, 0
+; GFX6-NEXT:    s_max_i32 s16, s8, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s8, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s24
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s9, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s24
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s24
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s8, s8, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s9, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s9, 0
+; GFX6-NEXT:    s_max_i32 s16, s9, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s9, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s9, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s25
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s10, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s25
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s25
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s10, 0
+; GFX6-NEXT:    s_max_i32 s16, s10, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s10, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s26
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s11, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s26
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s26
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s10, s10, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s11, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s11, 0
+; GFX6-NEXT:    s_max_i32 s16, s11, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s11, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s11, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s27
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s12, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s27
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s27
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s11, s11, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s12, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s12, 0
+; GFX6-NEXT:    s_max_i32 s16, s12, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s12, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s12, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s28
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s13, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s28
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s28
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s12, s12, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s13, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s13, 0
+; GFX6-NEXT:    s_max_i32 s16, s13, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s13, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s13, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s29
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s14, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s29
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s29
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s13, s13, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s14, 0
+; GFX6-NEXT:    s_max_i32 s16, s14, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s14, 0
+; GFX6-NEXT:    s_max_i32 s17, s17, s30
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
+; GFX6-NEXT:    s_min_i32 s17, s15, 0
 ; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s30
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s30
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s14, s14, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s15, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s15, 0
+; GFX6-NEXT:    s_max_i32 s16, s15, 0
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s15, 0
-; GFX6-NEXT:    s_cselect_b32 s17, s15, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s17, s31
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s31
-; GFX6-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s17, s16
+; GFX6-NEXT:    s_max_i32 s17, s17, s31
+; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_add_i32 s15, s15, s16
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX8-NEXT:    s_brev_b32 s32, -2
-; GFX8-NEXT:    s_cselect_b32 s34, s0, 0
-; GFX8-NEXT:    s_sub_i32 s34, s32, s34
-; GFX8-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX8-NEXT:    s_brev_b32 s33, 1
-; GFX8-NEXT:    s_cselect_b32 s35, s0, 0
+; GFX8-NEXT:    s_min_i32 s35, s0, 0
 ; GFX8-NEXT:    s_sub_i32 s35, s33, s35
-; GFX8-NEXT:    s_cmp_gt_i32 s35, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s35, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s34
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s34
+; GFX8-NEXT:    s_brev_b32 s32, -2
+; GFX8-NEXT:    s_max_i32 s34, s0, 0
+; GFX8-NEXT:    s_sub_i32 s34, s32, s34
+; GFX8-NEXT:    s_max_i32 s16, s35, s16
+; GFX8-NEXT:    s_min_i32 s16, s16, s34
+; GFX8-NEXT:    s_min_i32 s34, s1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s1, 0
-; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX8-NEXT:    s_cselect_b32 s34, s1, 0
+; GFX8-NEXT:    s_max_i32 s16, s1, 0
 ; GFX8-NEXT:    s_sub_i32 s34, s33, s34
-; GFX8-NEXT:    s_cmp_gt_i32 s34, s17
-; GFX8-NEXT:    s_cselect_b32 s17, s34, s17
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
+; GFX8-NEXT:    s_sub_i32 s16, s32, s16
+; GFX8-NEXT:    s_max_i32 s17, s34, s17
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s2, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s1, s1, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s2, 0
+; GFX8-NEXT:    s_max_i32 s16, s2, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s2, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s18
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s18
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s2, s2, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s3, 0
+; GFX8-NEXT:    s_max_i32 s16, s3, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s3, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s19
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s4, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s19
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s19
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s3, s3, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s4, 0
+; GFX8-NEXT:    s_max_i32 s16, s4, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s4, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s20
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s5, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s20
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s20
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s4, s4, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s5, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s5, 0
+; GFX8-NEXT:    s_max_i32 s16, s5, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s5, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s21
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s6, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s21
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s21
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s5, s5, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s6, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s6, 0
+; GFX8-NEXT:    s_max_i32 s16, s6, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s6, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s6, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s22
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s7, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s22
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s22
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s6, s6, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s7, 0
+; GFX8-NEXT:    s_max_i32 s16, s7, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s7, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s23
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s8, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s23
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s23
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s7, s7, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s8, 0
+; GFX8-NEXT:    s_max_i32 s16, s8, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s8, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s24
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s9, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s24
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s24
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s8, s8, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s9, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s9, 0
+; GFX8-NEXT:    s_max_i32 s16, s9, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s9, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s9, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s25
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s10, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s25
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s25
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s9, s9, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s10, 0
+; GFX8-NEXT:    s_max_i32 s16, s10, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s10, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s26
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s11, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s26
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s26
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s10, s10, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s11, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s11, 0
+; GFX8-NEXT:    s_max_i32 s16, s11, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s11, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s11, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s27
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s12, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s27
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s27
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s11, s11, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s12, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s12, 0
+; GFX8-NEXT:    s_max_i32 s16, s12, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s12, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s12, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s28
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s13, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s28
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s28
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s12, s12, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s13, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s13, 0
+; GFX8-NEXT:    s_max_i32 s16, s13, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s13, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s13, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s29
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s14, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s29
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s29
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s13, s13, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s14, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s14, 0
+; GFX8-NEXT:    s_max_i32 s16, s14, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s14, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s14, 0
+; GFX8-NEXT:    s_max_i32 s17, s17, s30
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
+; GFX8-NEXT:    s_min_i32 s17, s15, 0
 ; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s30
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s30
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s14, s14, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s15, 0
-; GFX8-NEXT:    s_cselect_b32 s16, s15, 0
+; GFX8-NEXT:    s_max_i32 s16, s15, 0
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s15, 0
-; GFX8-NEXT:    s_cselect_b32 s17, s15, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
-; GFX8-NEXT:    s_cmp_gt_i32 s17, s31
-; GFX8-NEXT:    s_cselect_b32 s17, s17, s31
-; GFX8-NEXT:    s_cmp_lt_i32 s17, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s17, s16
+; GFX8-NEXT:    s_max_i32 s17, s17, s31
+; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_add_i32 s15, s15, s16
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2892,17 +2572,13 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_min_i32 s3, s0, 0
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
-; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
+; GFX6-NEXT:    s_max_i32 s1, s3, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2911,20 +2587,16 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s3, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s4, s2, s3
-; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX8-NEXT:    s_max_i32 s1, s2, s1
+; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2948,13 +2620,11 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX6-LABEL: saddsat_i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s1, s0, 0
-; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
+; GFX6-NEXT:    s_min_i32 s2, s0, 0
+; GFX6-NEXT:    s_max_i32 s1, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
+; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
 ; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -2965,12 +2635,10 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s3, s1, s2
-; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX8-NEXT:    s_max_i32 s3, s1, s2
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s1, 0xffff8000, s1
+; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
 ; GFX8-NEXT:    v_max_i16_e32 v0, s1, v0
 ; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
 ; GFX8-NEXT:    v_add_u16_e32 v0, s0, v0
@@ -3101,36 +2769,28 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-LABEL: s_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    s_min_i32 s7, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, 0
+; GFX6-NEXT:    s_max_i32 s6, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s7, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s6
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s6
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s2, s7, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s1, 0
+; GFX6-NEXT:    s_max_i32 s3, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s3, s4, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s1, 0
+; GFX6-NEXT:    s_min_i32 s4, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX6-NEXT:    s_max_i32 s2, s4, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3139,42 +2799,34 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX8-LABEL: s_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s7, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s7
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s8, s6, s7
-; GFX8-NEXT:    s_sub_i32 s8, s4, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX8-NEXT:    s_min_i32 s6, s6, s7
 ; GFX8-NEXT:    s_sub_i32 s6, s5, s6
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s6, s1
+; GFX8-NEXT:    s_max_i32 s1, s6, s1
+; GFX8-NEXT:    s_sub_i32 s8, s4, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s6
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_min_i32 s1, s1, s6
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s6, s1, s7
-; GFX8-NEXT:    s_sub_i32 s4, s4, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8-NEXT:    s_max_i32 s6, s1, s7
+; GFX8-NEXT:    s_min_i32 s1, s1, s7
 ; GFX8-NEXT:    s_sub_i32 s1, s5, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_max_i32 s1, s1, s3
+; GFX8-NEXT:    s_sub_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s2, s2, s1
 ; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -3203,24 +2855,20 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: saddsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s2, -2
-; GFX6-NEXT:    s_cselect_b32 s4, s0, 0
-; GFX6-NEXT:    s_sub_i32 s4, s2, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
 ; GFX6-NEXT:    s_brev_b32 s3, 1
-; GFX6-NEXT:    s_cselect_b32 s5, s0, 0
+; GFX6-NEXT:    s_min_i32 s5, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_sub_i32 s5, s3, s5
+; GFX6-NEXT:    s_brev_b32 s2, -2
+; GFX6-NEXT:    s_max_i32 s4, s0, 0
+; GFX6-NEXT:    s_sub_i32 s4, s2, s4
 ; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s1, s0, 0
+; GFX6-NEXT:    s_max_i32 s1, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, s0, 0
+; GFX6-NEXT:    s_min_i32 s2, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    s_sub_i32 s2, s3, s2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
@@ -3237,28 +2885,24 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX8-LABEL: saddsat_v2i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s5, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s5
-; GFX8-NEXT:    s_sub_i32 s6, s2, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    s_sub_i32 s4, s3, s4
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
+; GFX8-NEXT:    s_sub_i32 s6, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX8-NEXT:    v_min_i16_e32 v1, s6, v1
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s5
-; GFX8-NEXT:    s_sub_i32 s2, s2, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_add_u16_e32 v1, s0, v1
@@ -3481,64 +3125,48 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-LABEL: s_saddsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_brev_b32 s9, 1
+; GFX6-NEXT:    s_min_i32 s11, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, 0
+; GFX6-NEXT:    s_max_i32 s10, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s8, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s11, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s10
-; GFX6-NEXT:    s_add_i32 s0, s0, s4
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s4, s11, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_min_i32 s10, s1, 0
+; GFX6-NEXT:    s_add_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s1, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s1, 0
+; GFX6-NEXT:    s_max_i32 s5, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s9, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX6-NEXT:    s_add_i32 s1, s1, s4
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s4, s10, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX6-NEXT:    s_min_i32 s6, s2, 0
+; GFX6-NEXT:    s_max_i32 s5, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX6-NEXT:    s_add_i32 s2, s2, s4
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s4, s6, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
+; GFX6-NEXT:    s_min_i32 s6, s3, 0
+; GFX6-NEXT:    s_add_i32 s2, s2, s4
+; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s5, s8, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s6, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s5, s8, s5
+; GFX6-NEXT:    s_max_i32 s4, s6, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3551,76 +3179,60 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX8-LABEL: s_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s11, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s11
-; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s12, s10, s11
-; GFX8-NEXT:    s_sub_i32 s12, s8, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_movk_i32 s9, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s10, s10, s11
+; GFX8-NEXT:    s_min_i32 s10, s10, s11
 ; GFX8-NEXT:    s_sub_i32 s10, s9, s10
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s10, s2
+; GFX8-NEXT:    s_max_i32 s2, s10, s2
+; GFX8-NEXT:    s_sub_i32 s12, s8, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s10
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_min_i32 s2, s2, s10
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s10, s2, s11
-; GFX8-NEXT:    s_sub_i32 s10, s8, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s11
+; GFX8-NEXT:    s_max_i32 s10, s2, s11
+; GFX8-NEXT:    s_min_i32 s2, s2, s11
 ; GFX8-NEXT:    s_sub_i32 s2, s9, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s6
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX8-NEXT:    s_max_i32 s2, s2, s6
+; GFX8-NEXT:    s_sub_i32 s10, s8, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s6
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX8-NEXT:    s_min_i32 s2, s2, s6
 ; GFX8-NEXT:    s_add_i32 s4, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s6, s2, s11
-; GFX8-NEXT:    s_sub_i32 s6, s8, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s11
+; GFX8-NEXT:    s_max_i32 s6, s2, s11
+; GFX8-NEXT:    s_min_i32 s2, s2, s11
 ; GFX8-NEXT:    s_sub_i32 s2, s9, s2
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_max_i32 s2, s2, s3
+; GFX8-NEXT:    s_sub_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s3, s2, s11
-; GFX8-NEXT:    s_sub_i32 s3, s8, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s11
+; GFX8-NEXT:    s_max_i32 s3, s2, s11
+; GFX8-NEXT:    s_min_i32 s2, s2, s11
 ; GFX8-NEXT:    s_sub_i32 s2, s9, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s6
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX8-NEXT:    s_sub_i32 s3, s8, s3
+; GFX8-NEXT:    s_max_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s5, s5, s2
 ; GFX8-NEXT:    s_bfe_u32 s2, s4, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -3830,92 +3442,67 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-LABEL: s_saddsat_v6i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_brev_b32 s13, 1
+; GFX6-NEXT:    s_min_i32 s15, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s15, s13, s15
 ; GFX6-NEXT:    s_brev_b32 s12, -2
-; GFX6-NEXT:    s_cselect_b32 s14, s0, 0
+; GFX6-NEXT:    s_max_i32 s14, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s14, s12, s14
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s13, 1
-; GFX6-NEXT:    s_cselect_b32 s15, s0, 0
-; GFX6-NEXT:    s_sub_i32 s15, s13, s15
-; GFX6-NEXT:    s_cmp_gt_i32 s15, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s15, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s14
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
-; GFX6-NEXT:    s_add_i32 s0, s0, s6
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s6, s15, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s14
+; GFX6-NEXT:    s_min_i32 s14, s1, 0
+; GFX6-NEXT:    s_add_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s1, 0
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s1, 0
+; GFX6-NEXT:    s_max_i32 s7, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s14, s13, s14
-; GFX6-NEXT:    s_cmp_gt_i32 s14, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s14, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX6-NEXT:    s_add_i32 s1, s1, s6
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_max_i32 s6, s14, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s2, 0
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s8, s2, 0
+; GFX6-NEXT:    s_min_i32 s8, s2, 0
+; GFX6-NEXT:    s_max_i32 s7, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX6-NEXT:    s_add_i32 s2, s2, s6
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
+; GFX6-NEXT:    s_min_i32 s8, s3, 0
+; GFX6-NEXT:    s_add_i32 s2, s2, s6
+; GFX6-NEXT:    s_max_i32 s7, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s3, 0
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s8, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX6-NEXT:    s_add_i32 s3, s3, s6
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
+; GFX6-NEXT:    s_min_i32 s8, s4, 0
+; GFX6-NEXT:    s_add_i32 s3, s3, s6
+; GFX6-NEXT:    s_max_i32 s7, s4, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s4, 0
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s8, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX6-NEXT:    s_add_i32 s4, s4, s6
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
+; GFX6-NEXT:    s_min_i32 s8, s5, 0
+; GFX6-NEXT:    s_add_i32 s4, s4, s6
+; GFX6-NEXT:    s_max_i32 s7, s5, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s5, 0
-; GFX6-NEXT:    s_sub_i32 s7, s12, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s8, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s7, s12, s7
+; GFX6-NEXT:    s_max_i32 s6, s8, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_add_i32 s5, s5, s6
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s6
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -3924,6 +3511,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_and_b32 s3, s5, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
@@ -3932,110 +3520,86 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX8-LABEL: s_saddsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s15, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s14, s15
-; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s16, s14, s15
-; GFX8-NEXT:    s_sub_i32 s16, s12, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s14, s15
+; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_movk_i32 s13, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s14, s14, s15
+; GFX8-NEXT:    s_min_i32 s14, s14, s15
 ; GFX8-NEXT:    s_sub_i32 s14, s13, s14
+; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s14, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s14, s3
+; GFX8-NEXT:    s_max_i32 s3, s14, s3
+; GFX8-NEXT:    s_sub_i32 s16, s12, s16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s14
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s14
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_min_i32 s3, s3, s14
 ; GFX8-NEXT:    s_add_i32 s0, s0, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s14, s3, s15
-; GFX8-NEXT:    s_sub_i32 s14, s12, s14
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
+; GFX8-NEXT:    s_max_i32 s14, s3, s15
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s9
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX8-NEXT:    s_max_i32 s3, s3, s9
+; GFX8-NEXT:    s_sub_i32 s14, s12, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s14
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_add_i32 s6, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s9, s3, s15
-; GFX8-NEXT:    s_sub_i32 s9, s12, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
+; GFX8-NEXT:    s_max_i32 s9, s3, s15
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_max_i32 s3, s3, s4
+; GFX8-NEXT:    s_sub_i32 s9, s12, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s4, s3, s15
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
+; GFX8-NEXT:    s_max_i32 s4, s3, s15
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s10
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s9
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_max_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s7, s7, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s4, s3, s15
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
+; GFX8-NEXT:    s_max_i32 s4, s3, s15
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX8-NEXT:    s_max_i32 s3, s3, s5
+; GFX8-NEXT:    s_sub_i32 s4, s12, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s8
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s4, s3, s15
-; GFX8-NEXT:    s_sub_i32 s4, s12, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
+; GFX8-NEXT:    s_max_i32 s4, s3, s15
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s8, s8, s3
 ; GFX8-NEXT:    s_bfe_u32 s3, s6, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -4285,132 +3849,100 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-LABEL: s_saddsat_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_brev_b32 s17, 1
+; GFX6-NEXT:    s_min_i32 s19, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, 0
+; GFX6-NEXT:    s_sub_i32 s19, s17, s19
 ; GFX6-NEXT:    s_brev_b32 s16, -2
-; GFX6-NEXT:    s_cselect_b32 s18, s0, 0
+; GFX6-NEXT:    s_max_i32 s18, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s18, s16, s18
-; GFX6-NEXT:    s_cmp_lt_i32 s0, 0
-; GFX6-NEXT:    s_brev_b32 s17, 1
-; GFX6-NEXT:    s_cselect_b32 s19, s0, 0
-; GFX6-NEXT:    s_sub_i32 s19, s17, s19
-; GFX6-NEXT:    s_cmp_gt_i32 s19, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s19, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s18
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s18
-; GFX6-NEXT:    s_add_i32 s0, s0, s8
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s8, s19, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s18
+; GFX6-NEXT:    s_min_i32 s18, s1, 0
+; GFX6-NEXT:    s_add_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s1, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX6-NEXT:    s_cselect_b32 s18, s1, 0
+; GFX6-NEXT:    s_max_i32 s9, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s18, s17, s18
-; GFX6-NEXT:    s_cmp_gt_i32 s18, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s18, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s1, s1, s8
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s18, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s2, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s2, 0
+; GFX6-NEXT:    s_min_i32 s10, s2, 0
+; GFX6-NEXT:    s_max_i32 s9, s2, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s2, s2, s8
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_min_i32 s10, s3, 0
+; GFX6-NEXT:    s_add_i32 s2, s2, s8
+; GFX6-NEXT:    s_max_i32 s9, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s3, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s3, s3, s8
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_min_i32 s10, s4, 0
+; GFX6-NEXT:    s_add_i32 s3, s3, s8
+; GFX6-NEXT:    s_max_i32 s9, s4, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s4, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s4, s4, s8
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_min_i32 s10, s5, 0
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    s_max_i32 s9, s5, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s5, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s5, s5, s8
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_min_i32 s10, s6, 0
+; GFX6-NEXT:    s_add_i32 s5, s5, s8
+; GFX6-NEXT:    s_max_i32 s9, s6, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s6, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s6, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s6, s6, s8
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
+; GFX6-NEXT:    s_min_i32 s10, s7, 0
+; GFX6-NEXT:    s_add_i32 s6, s6, s8
+; GFX6-NEXT:    s_max_i32 s9, s7, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s7, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s7, 0
-; GFX6-NEXT:    s_sub_i32 s9, s16, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s7, 0
-; GFX6-NEXT:    s_cselect_b32 s10, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX6-NEXT:    s_sub_i32 s9, s16, s9
+; GFX6-NEXT:    s_max_i32 s8, s10, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_add_i32 s7, s7, s8
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s8
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_and_b32 s3, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s8
 ; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -4419,144 +3951,112 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX8-LABEL: s_saddsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s19, 0
-; GFX8-NEXT:    s_cmp_gt_i32 s18, s19
-; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s20, s18, s19
-; GFX8-NEXT:    s_sub_i32 s20, s16, s20
-; GFX8-NEXT:    s_cmp_lt_i32 s18, s19
+; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_movk_i32 s17, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s18, s18, s19
+; GFX8-NEXT:    s_min_i32 s18, s18, s19
 ; GFX8-NEXT:    s_sub_i32 s18, s17, s18
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s18, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s18, s4
+; GFX8-NEXT:    s_max_i32 s4, s18, s4
+; GFX8-NEXT:    s_sub_i32 s20, s16, s20
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s20
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s18
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s18
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s18
 ; GFX8-NEXT:    s_add_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s8
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s18, s4, s19
-; GFX8-NEXT:    s_sub_i32 s18, s16, s18
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s18, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s12
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX8-NEXT:    s_max_i32 s4, s4, s12
+; GFX8-NEXT:    s_sub_i32 s18, s16, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s18
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s12
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX8-NEXT:    s_min_i32 s4, s4, s12
 ; GFX8-NEXT:    s_add_i32 s8, s8, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s12, s4, s19
-; GFX8-NEXT:    s_sub_i32 s12, s16, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s12, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s5
+; GFX8-NEXT:    s_sub_i32 s12, s16, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s4, s19
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s13
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s12
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s9, s9, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s4, s19
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s5, s16, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s10
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s4, s19
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s14
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s5, s16, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s10, s10, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s4, s19
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s4, s19
-; GFX8-NEXT:    s_sub_i32 s5, s16, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
+; GFX8-NEXT:    s_max_i32 s5, s4, s19
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s15
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s11, s11, s4
 ; GFX8-NEXT:    s_bfe_u32 s4, s8, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 6209f7e4335b..945bac091858 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -61,17 +61,13 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i7:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s1, s2, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -80,23 +76,19 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_bfe_u32 s2, 9, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s4, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s5, s3, s4
+; GFX8-NEXT:    s_max_i32 s5, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fff
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s4, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
@@ -183,17 +175,13 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s1, s2, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -202,23 +190,19 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s4, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s5, s3, s4
+; GFX8-NEXT:    s_max_i32 s5, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fff
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s4, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
@@ -360,38 +344,30 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-LABEL: s_ssubsat_v2i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, -1
+; GFX6-NEXT:    s_max_i32 s6, s0, -1
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, -1
+; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s6, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s7
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX6-NEXT:    s_max_i32 s1, s6, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s7
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s1, -1
+; GFX6-NEXT:    s_max_i32 s3, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX6-NEXT:    s_min_i32 s4, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX6-NEXT:    s_max_i32 s2, s3, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
+; GFX6-NEXT:    s_movk_i32 s2, 0xff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
@@ -403,50 +379,42 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s8, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s8
+; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s9, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s7, s8
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s7, s7, s8
-; GFX8-NEXT:    s_sub_i32 s7, s7, s6
+; GFX8-NEXT:    s_min_i32 s7, s7, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s9, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX8-NEXT:    s_sub_i32 s7, s7, s6
+; GFX8-NEXT:    s_max_i32 s1, s9, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8-NEXT:    s_min_i32 s1, s1, s7
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s7, s3, s8
+; GFX8-NEXT:    s_max_i32 s7, s3, s8
 ; GFX8-NEXT:    s_sub_i32 s5, s7, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s8
-; GFX8-NEXT:    s_sub_i32 s3, s3, s6
+; GFX8-NEXT:    s_min_i32 s3, s3, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX8-NEXT:    s_sub_i32 s3, s3, s6
+; GFX8-NEXT:    s_max_i32 s2, s5, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
+; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
@@ -714,68 +682,52 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX6-NEXT:    s_brev_b32 s8, -2
+; GFX6-NEXT:    s_max_i32 s10, s0, -1
 ; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, -1
+; GFX6-NEXT:    s_min_i32 s11, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s10, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s11
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s11
+; GFX6-NEXT:    s_max_i32 s1, s10, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s11
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s1, -1
+; GFX6-NEXT:    s_max_i32 s5, s1, -1
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s10
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s10
+; GFX6-NEXT:    s_max_i32 s2, s5, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s10
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
+; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s5, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s6
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX6-NEXT:    s_max_i32 s3, s5, s3
+; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX6-NEXT:    s_max_i32 s5, s3, -1
+; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX6-NEXT:    s_max_i32 s4, s5, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -795,91 +747,75 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
-; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s12, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s11, s12
+; GFX8-NEXT:    s_max_i32 s13, s11, s12
 ; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s13, s11, s12
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s11, s12
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
 ; GFX8-NEXT:    s_movk_i32 s10, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s11, s11, s12
-; GFX8-NEXT:    s_sub_i32 s11, s11, s10
+; GFX8-NEXT:    s_min_i32 s11, s11, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s13, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s13, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s13, s1
+; GFX8-NEXT:    s_sub_i32 s11, s11, s10
+; GFX8-NEXT:    s_max_i32 s1, s13, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s11
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s11
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s11
+; GFX8-NEXT:    s_min_i32 s1, s1, s11
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s11, s5, s12
+; GFX8-NEXT:    s_max_i32 s11, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s11, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s11, s2
+; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_max_i32 s2, s11, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s5
+; GFX8-NEXT:    s_min_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
-; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
-; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s6, s5, s12
+; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
+; GFX8-NEXT:    s_max_i32 s6, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX8-NEXT:    s_min_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s3
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
-; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s6, s5, s12
+; GFX8-NEXT:    s_max_i32 s6, s5, s12
+; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
-; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX8-NEXT:    s_sub_i32 s5, s5, s10
+; GFX8-NEXT:    s_max_i32 s4, s6, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_and_b32 s1, s1, s4
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
 ; GFX8-NEXT:    s_and_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
@@ -1046,17 +982,13 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i24:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s1, s2, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1145,31 +1077,23 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s1, s2, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX8-NEXT:    s_cselect_b32 s2, s0, -1
+; GFX8-NEXT:    s_max_i32 s2, s0, -1
+; GFX8-NEXT:    s_min_i32 s3, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX8-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX8-NEXT:    s_max_i32 s1, s2, s1
+; GFX8-NEXT:    s_min_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1192,11 +1116,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX6-LABEL: ssubsat_i32_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, -1
+; GFX6-NEXT:    s_max_i32 s1, s0, -1
+; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -1205,11 +1127,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ;
 ; GFX8-LABEL: ssubsat_i32_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX8-NEXT:    s_cselect_b32 s1, s0, -1
+; GFX8-NEXT:    s_max_i32 s1, s0, -1
+; GFX8-NEXT:    s_min_i32 s2, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX8-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX8-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -1331,57 +1251,41 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, -1
+; GFX6-NEXT:    s_max_i32 s6, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, -1
+; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s7
+; GFX6-NEXT:    s_max_i32 s2, s6, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s7
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s1, -1
+; GFX6-NEXT:    s_max_i32 s2, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX6-NEXT:    s_min_i32 s4, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX6-NEXT:    s_max_i32 s2, s2, s3
+; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    s_cselect_b32 s6, s0, -1
+; GFX8-NEXT:    s_max_i32 s6, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cselect_b32 s7, s0, -1
+; GFX8-NEXT:    s_min_i32 s7, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s7
+; GFX8-NEXT:    s_max_i32 s2, s6, s2
+; GFX8-NEXT:    s_min_i32 s2, s2, s7
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s2, s1, -1
+; GFX8-NEXT:    s_max_i32 s2, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX8-NEXT:    s_min_i32 s4, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX8-NEXT:    s_max_i32 s2, s2, s3
+; GFX8-NEXT:    s_min_i32 s2, s2, s4
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1486,79 +1390,55 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s6, -2
-; GFX6-NEXT:    s_cselect_b32 s8, s0, -1
+; GFX6-NEXT:    s_max_i32 s8, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s7, 1
-; GFX6-NEXT:    s_cselect_b32 s9, s0, -1
+; GFX6-NEXT:    s_min_i32 s9, s0, -1
+; GFX6-NEXT:    s_max_i32 s3, s8, s3
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s8, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s8, s3
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX6-NEXT:    s_min_i32 s3, s3, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s1, -1
+; GFX6-NEXT:    s_max_i32 s3, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s8, s1, -1
+; GFX6-NEXT:    s_min_i32 s8, s1, -1
+; GFX6-NEXT:    s_max_i32 s3, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX6-NEXT:    s_min_i32 s3, s3, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s3
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s2, -1
+; GFX6-NEXT:    s_max_i32 s3, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s2, -1
+; GFX6-NEXT:    s_min_i32 s4, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s7
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX6-NEXT:    s_max_i32 s3, s3, s5
+; GFX6-NEXT:    s_min_i32 s3, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s6, -2
-; GFX8-NEXT:    s_cselect_b32 s8, s0, -1
+; GFX8-NEXT:    s_max_i32 s8, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s7, 1
-; GFX8-NEXT:    s_cselect_b32 s9, s0, -1
+; GFX8-NEXT:    s_min_i32 s9, s0, -1
+; GFX8-NEXT:    s_max_i32 s3, s8, s3
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s8, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s3, s1, -1
+; GFX8-NEXT:    s_max_i32 s3, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s8, s1, -1
+; GFX8-NEXT:    s_min_i32 s8, s1, -1
+; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s8
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX8-NEXT:    s_min_i32 s3, s3, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s3, s2, -1
+; GFX8-NEXT:    s_max_i32 s3, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s4, s2, -1
+; GFX8-NEXT:    s_min_i32 s4, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_max_i32 s3, s3, s5
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1684,101 +1564,69 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, -1
+; GFX6-NEXT:    s_max_i32 s10, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, -1
+; GFX6-NEXT:    s_min_i32 s11, s0, -1
+; GFX6-NEXT:    s_max_i32 s4, s10, s4
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX6-NEXT:    s_min_i32 s4, s4, s11
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX6-NEXT:    s_max_i32 s4, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s1, -1
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
+; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX6-NEXT:    s_min_i32 s4, s4, s10
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s2, -1
+; GFX6-NEXT:    s_max_i32 s4, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX6-NEXT:    s_min_i32 s5, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX6-NEXT:    s_max_i32 s4, s4, s6
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s3, -1
+; GFX6-NEXT:    s_max_i32 s4, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX6-NEXT:    s_min_i32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s4, s7
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX6-NEXT:    s_max_i32 s4, s4, s7
+; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_cselect_b32 s10, s0, -1
+; GFX8-NEXT:    s_max_i32 s10, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s9, 1
-; GFX8-NEXT:    s_cselect_b32 s11, s0, -1
+; GFX8-NEXT:    s_min_i32 s11, s0, -1
+; GFX8-NEXT:    s_max_i32 s4, s10, s4
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX8-NEXT:    s_min_i32 s4, s4, s11
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX8-NEXT:    s_max_i32 s4, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s10, s1, -1
+; GFX8-NEXT:    s_min_i32 s10, s1, -1
+; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    s_min_i32 s4, s4, s10
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s4, s2, -1
+; GFX8-NEXT:    s_max_i32 s4, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX8-NEXT:    s_min_i32 s5, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s4, s3, -1
+; GFX8-NEXT:    s_max_i32 s4, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX8-NEXT:    s_min_i32 s5, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s7
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s7
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1929,123 +1777,83 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s10, -2
-; GFX6-NEXT:    s_cselect_b32 s12, s0, -1
+; GFX6-NEXT:    s_max_i32 s12, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s11, 1
-; GFX6-NEXT:    s_cselect_b32 s13, s0, -1
+; GFX6-NEXT:    s_min_i32 s13, s0, -1
+; GFX6-NEXT:    s_max_i32 s5, s12, s5
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s12, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s12, s5
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s13
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s13
+; GFX6-NEXT:    s_min_i32 s5, s5, s13
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s1, -1
+; GFX6-NEXT:    s_max_i32 s5, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s12, s1, -1
+; GFX6-NEXT:    s_min_i32 s12, s1, -1
+; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX6-NEXT:    s_min_i32 s5, s5, s12
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s2, -1
+; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX6-NEXT:    s_max_i32 s5, s5, s7
+; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX6-NEXT:    s_max_i32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s3, -1
+; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s8
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX6-NEXT:    s_max_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s4, -1
+; GFX6-NEXT:    s_max_i32 s5, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s10
-; GFX6-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s4, -1
+; GFX6-NEXT:    s_min_i32 s6, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s11
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s9
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s9
-; GFX6-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX6-NEXT:    s_max_i32 s5, s5, s9
+; GFX6-NEXT:    s_min_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s10, -2
-; GFX8-NEXT:    s_cselect_b32 s12, s0, -1
+; GFX8-NEXT:    s_max_i32 s12, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s11, 1
-; GFX8-NEXT:    s_cselect_b32 s13, s0, -1
+; GFX8-NEXT:    s_min_i32 s13, s0, -1
+; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s12, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s12, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s13
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s13
+; GFX8-NEXT:    s_min_i32 s5, s5, s13
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s1, -1
+; GFX8-NEXT:    s_max_i32 s5, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s12, s1, -1
+; GFX8-NEXT:    s_min_i32 s12, s1, -1
+; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s12
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s12
+; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX8-NEXT:    s_max_i32 s5, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s6, s2, -1
+; GFX8-NEXT:    s_min_i32 s6, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX8-NEXT:    s_max_i32 s5, s5, s7
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX8-NEXT:    s_max_i32 s5, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s6, s3, -1
+; GFX8-NEXT:    s_min_i32 s6, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s8
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX8-NEXT:    s_max_i32 s5, s5, s8
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX8-NEXT:    s_cselect_b32 s5, s4, -1
+; GFX8-NEXT:    s_max_i32 s5, s4, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX8-NEXT:    s_cselect_b32 s6, s4, -1
+; GFX8-NEXT:    s_min_i32 s6, s4, -1
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s9
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s9
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX8-NEXT:    s_max_i32 s5, s5, s9
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2377,365 +2185,237 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s32, -2
-; GFX6-NEXT:    s_cselect_b32 s34, s0, -1
+; GFX6-NEXT:    s_max_i32 s34, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s34, s34, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s33, 1
-; GFX6-NEXT:    s_cselect_b32 s35, s0, -1
+; GFX6-NEXT:    s_min_i32 s35, s0, -1
+; GFX6-NEXT:    s_max_i32 s16, s34, s16
 ; GFX6-NEXT:    s_sub_i32 s35, s35, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s34, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s34, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s35
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s35
+; GFX6-NEXT:    s_min_i32 s16, s16, s35
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s1, -1
+; GFX6-NEXT:    s_max_i32 s16, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s34, s1, -1
+; GFX6-NEXT:    s_min_i32 s34, s1, -1
+; GFX6-NEXT:    s_max_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s34, s34, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s34
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s34
+; GFX6-NEXT:    s_min_i32 s16, s16, s34
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s2, -1
+; GFX6-NEXT:    s_max_i32 s16, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s2, -1
+; GFX6-NEXT:    s_min_i32 s17, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s18
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s18
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s18
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s3, -1
+; GFX6-NEXT:    s_max_i32 s16, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s3, -1
+; GFX6-NEXT:    s_min_i32 s17, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s19
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s19
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s19
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s4, -1
+; GFX6-NEXT:    s_max_i32 s16, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s4, -1
+; GFX6-NEXT:    s_min_i32 s17, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s20
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s20
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s20
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s5, -1
+; GFX6-NEXT:    s_max_i32 s16, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s5, -1
+; GFX6-NEXT:    s_min_i32 s17, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s21
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s21
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s21
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s6, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s6, -1
+; GFX6-NEXT:    s_max_i32 s16, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s6, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s6, -1
+; GFX6-NEXT:    s_min_i32 s17, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s22
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s22
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s22
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s7, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s7, -1
+; GFX6-NEXT:    s_max_i32 s16, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s7, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s7, -1
+; GFX6-NEXT:    s_min_i32 s17, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s23
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s23
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s23
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s8, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s8, -1
+; GFX6-NEXT:    s_max_i32 s16, s8, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s8, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s8, -1
+; GFX6-NEXT:    s_min_i32 s17, s8, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s24
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s24
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s24
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s9, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s9, -1
+; GFX6-NEXT:    s_max_i32 s16, s9, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s9, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s9, -1
+; GFX6-NEXT:    s_min_i32 s17, s9, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s25
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s25
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s25
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s10, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s10, -1
+; GFX6-NEXT:    s_max_i32 s16, s10, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s10, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s10, -1
+; GFX6-NEXT:    s_min_i32 s17, s10, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s26
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s26
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s26
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s11, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s11, -1
+; GFX6-NEXT:    s_max_i32 s16, s11, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s11, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s11, -1
+; GFX6-NEXT:    s_min_i32 s17, s11, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s27
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s27
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s27
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s12, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s12, -1
+; GFX6-NEXT:    s_max_i32 s16, s12, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s12, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s12, -1
+; GFX6-NEXT:    s_min_i32 s17, s12, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s28
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s28
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s28
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s13, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s13, -1
+; GFX6-NEXT:    s_max_i32 s16, s13, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s13, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s13, -1
+; GFX6-NEXT:    s_min_i32 s17, s13, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s29
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s29
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s29
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s14, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s14, -1
+; GFX6-NEXT:    s_max_i32 s16, s14, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s14, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s14, -1
+; GFX6-NEXT:    s_min_i32 s17, s14, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s30
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s30
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s30
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s16
-; GFX6-NEXT:    s_cmp_gt_i32 s15, -1
-; GFX6-NEXT:    s_cselect_b32 s16, s15, -1
+; GFX6-NEXT:    s_max_i32 s16, s15, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
-; GFX6-NEXT:    s_cmp_lt_i32 s15, -1
-; GFX6-NEXT:    s_cselect_b32 s17, s15, -1
+; GFX6-NEXT:    s_min_i32 s17, s15, -1
 ; GFX6-NEXT:    s_sub_i32 s17, s17, s33
-; GFX6-NEXT:    s_cmp_gt_i32 s16, s31
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s31
-; GFX6-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_max_i32 s16, s16, s31
+; GFX6-NEXT:    s_min_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s16
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s32, -2
-; GFX8-NEXT:    s_cselect_b32 s34, s0, -1
+; GFX8-NEXT:    s_max_i32 s34, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s34, s34, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX8-NEXT:    s_brev_b32 s33, 1
-; GFX8-NEXT:    s_cselect_b32 s35, s0, -1
+; GFX8-NEXT:    s_min_i32 s35, s0, -1
+; GFX8-NEXT:    s_max_i32 s16, s34, s16
 ; GFX8-NEXT:    s_sub_i32 s35, s35, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s34, s16
-; GFX8-NEXT:    s_cselect_b32 s16, s34, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s35
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s35
+; GFX8-NEXT:    s_min_i32 s16, s16, s35
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s1, -1
+; GFX8-NEXT:    s_max_i32 s16, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX8-NEXT:    s_cselect_b32 s34, s1, -1
+; GFX8-NEXT:    s_min_i32 s34, s1, -1
+; GFX8-NEXT:    s_max_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s34, s34, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s34
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s34
+; GFX8-NEXT:    s_min_i32 s16, s16, s34
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s2, -1
+; GFX8-NEXT:    s_max_i32 s16, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s2, -1
+; GFX8-NEXT:    s_min_i32 s17, s2, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s18
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s18
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s18
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s3, -1
+; GFX8-NEXT:    s_max_i32 s16, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s3, -1
+; GFX8-NEXT:    s_min_i32 s17, s3, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s19
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s19
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s19
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s4, -1
+; GFX8-NEXT:    s_max_i32 s16, s4, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s4, -1
+; GFX8-NEXT:    s_min_i32 s17, s4, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s20
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s20
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s20
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s5, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s5, -1
+; GFX8-NEXT:    s_max_i32 s16, s5, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s5, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s5, -1
+; GFX8-NEXT:    s_min_i32 s17, s5, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s21
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s21
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s21
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s6, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s6, -1
+; GFX8-NEXT:    s_max_i32 s16, s6, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s6, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s6, -1
+; GFX8-NEXT:    s_min_i32 s17, s6, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s22
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s22
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s22
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s7, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s7, -1
+; GFX8-NEXT:    s_max_i32 s16, s7, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s7, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s7, -1
+; GFX8-NEXT:    s_min_i32 s17, s7, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s23
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s23
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s23
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s8, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s8, -1
+; GFX8-NEXT:    s_max_i32 s16, s8, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s8, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s8, -1
+; GFX8-NEXT:    s_min_i32 s17, s8, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s24
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s24
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s24
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s9, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s9, -1
+; GFX8-NEXT:    s_max_i32 s16, s9, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s9, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s9, -1
+; GFX8-NEXT:    s_min_i32 s17, s9, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s25
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s25
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s25
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s10, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s10, -1
+; GFX8-NEXT:    s_max_i32 s16, s10, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s10, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s10, -1
+; GFX8-NEXT:    s_min_i32 s17, s10, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s26
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s26
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s26
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s11, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s11, -1
+; GFX8-NEXT:    s_max_i32 s16, s11, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s11, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s11, -1
+; GFX8-NEXT:    s_min_i32 s17, s11, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s27
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s27
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s27
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s12, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s12, -1
+; GFX8-NEXT:    s_max_i32 s16, s12, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s12, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s12, -1
+; GFX8-NEXT:    s_min_i32 s17, s12, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s28
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s28
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s28
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s13, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s13, -1
+; GFX8-NEXT:    s_max_i32 s16, s13, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s13, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s13, -1
+; GFX8-NEXT:    s_min_i32 s17, s13, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s29
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s29
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s29
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s14, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s14, -1
+; GFX8-NEXT:    s_max_i32 s16, s14, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s14, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s14, -1
+; GFX8-NEXT:    s_min_i32 s17, s14, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s30
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s30
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s30
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s14, s14, s16
-; GFX8-NEXT:    s_cmp_gt_i32 s15, -1
-; GFX8-NEXT:    s_cselect_b32 s16, s15, -1
+; GFX8-NEXT:    s_max_i32 s16, s15, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
-; GFX8-NEXT:    s_cmp_lt_i32 s15, -1
-; GFX8-NEXT:    s_cselect_b32 s17, s15, -1
+; GFX8-NEXT:    s_min_i32 s17, s15, -1
 ; GFX8-NEXT:    s_sub_i32 s17, s17, s33
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s31
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s31
-; GFX8-NEXT:    s_cmp_lt_i32 s16, s17
-; GFX8-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX8-NEXT:    s_max_i32 s16, s16, s31
+; GFX8-NEXT:    s_min_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s15, s15, s16
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2878,17 +2558,13 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s2, s0, -1
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
-; GFX6-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX6-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX6-NEXT:    s_max_i32 s1, s2, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2897,20 +2573,16 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s3, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s4, s2, s3
+; GFX8-NEXT:    s_max_i32 s4, s2, s3
 ; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s3
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2934,12 +2606,10 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX6-LABEL: ssubsat_i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, -1
-; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
+; GFX6-NEXT:    s_max_i32 s1, s0, -1
+; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -2951,11 +2621,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s3, s1, s2
+; GFX8-NEXT:    s_max_i32 s3, s1, s2
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fff
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
 ; GFX8-NEXT:    v_max_i16_e32 v0, s3, v0
 ; GFX8-NEXT:    v_min_i16_e32 v0, s1, v0
@@ -3087,36 +2755,28 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-LABEL: s_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    s_cselect_b32 s6, s0, -1
+; GFX6-NEXT:    s_max_i32 s6, s0, -1
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cselect_b32 s7, s0, -1
+; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s6, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s7
-; GFX6-NEXT:    s_sub_i32 s0, s0, s2
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s2, s6, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s7
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s3, s1, -1
+; GFX6-NEXT:    s_max_i32 s3, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s4, s1, -1
+; GFX6-NEXT:    s_min_i32 s4, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_gt_i32 s3, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX6-NEXT:    s_max_i32 s2, s3, s2
+; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3125,42 +2785,34 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX8-LABEL: s_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s7, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s7
+; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s8, s6, s7
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX8-NEXT:    s_sub_i32 s6, s6, s5
+; GFX8-NEXT:    s_min_i32 s6, s6, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s8, s1
+; GFX8-NEXT:    s_sub_i32 s6, s6, s5
+; GFX8-NEXT:    s_max_i32 s1, s8, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s6
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_min_i32 s1, s1, s6
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s6, s1, s7
+; GFX8-NEXT:    s_max_i32 s6, s1, s7
 ; GFX8-NEXT:    s_sub_i32 s4, s6, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s7
-; GFX8-NEXT:    s_cselect_b32 s1, s1, s7
-; GFX8-NEXT:    s_sub_i32 s1, s1, s5
+; GFX8-NEXT:    s_min_i32 s1, s1, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_sub_i32 s1, s1, s5
+; GFX8-NEXT:    s_max_i32 s3, s4, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX8-NEXT:    s_min_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s1
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -3189,25 +2841,21 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: ssubsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s2, -2
-; GFX6-NEXT:    s_cselect_b32 s4, s0, -1
-; GFX6-NEXT:    s_sub_i32 s4, s4, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
+; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_sub_i32 s4, s4, s2
 ; GFX6-NEXT:    s_brev_b32 s3, 1
-; GFX6-NEXT:    s_cselect_b32 s5, s0, -1
+; GFX6-NEXT:    s_min_i32 s5, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s3
 ; GFX6-NEXT:    v_max_i32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, -1
+; GFX6-NEXT:    s_max_i32 s1, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
-; GFX6-NEXT:    s_cselect_b32 s2, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
@@ -3223,25 +2871,21 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX8-LABEL: ssubsat_v2i16_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s5, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s3
 ; GFX8-NEXT:    v_max_i16_e32 v1, s6, v0
 ; GFX8-NEXT:    v_min_i16_e32 v1, s4, v1
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s5
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s2, s6, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_sub_i32 s3, s4, s3
@@ -3467,64 +3111,48 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v4i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s8, -2
-; GFX6-NEXT:    s_cselect_b32 s10, s0, -1
+; GFX6-NEXT:    s_max_i32 s10, s0, -1
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s9, 1
-; GFX6-NEXT:    s_cselect_b32 s11, s0, -1
+; GFX6-NEXT:    s_min_i32 s11, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX6-NEXT:    s_sub_i32 s0, s0, s4
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s4, s10, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s11
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s1, -1
+; GFX6-NEXT:    s_max_i32 s5, s1, -1
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s10
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s10
-; GFX6-NEXT:    s_sub_i32 s1, s1, s4
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s5, s2, -1
+; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s4, s4, s6
+; GFX6-NEXT:    s_max_i32 s5, s3, -1
+; GFX6-NEXT:    s_sub_i32 s2, s2, s4
+; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
-; GFX6-NEXT:    s_cmp_gt_i32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX6-NEXT:    s_max_i32 s4, s5, s4
+; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3537,76 +3165,60 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s11, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s11
+; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s12, s10, s11
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_movk_i32 s9, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s10, s10, s11
-; GFX8-NEXT:    s_sub_i32 s10, s10, s9
+; GFX8-NEXT:    s_min_i32 s10, s10, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s12, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s12, s2
+; GFX8-NEXT:    s_sub_i32 s10, s10, s9
+; GFX8-NEXT:    s_max_i32 s2, s12, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s10
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_min_i32 s2, s2, s10
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s10, s2, s11
+; GFX8-NEXT:    s_max_i32 s10, s2, s11
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s2, s11
-; GFX8-NEXT:    s_cselect_b32 s2, s2, s11
-; GFX8-NEXT:    s_sub_i32 s2, s2, s9
+; GFX8-NEXT:    s_min_i32 s2, s2, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s10, s6
-; GFX8-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX8-NEXT:    s_sub_i32 s2, s2, s9
+; GFX8-NEXT:    s_max_i32 s6, s10, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s2
-; GFX8-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX8-NEXT:    s_min_i32 s2, s6, s2
 ; GFX8-NEXT:    s_sub_i32 s2, s4, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s11
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s11
+; GFX8-NEXT:    s_max_i32 s6, s4, s11
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX8-NEXT:    s_sub_i32 s4, s4, s9
+; GFX8-NEXT:    s_min_i32 s4, s4, s11
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX8-NEXT:    s_max_i32 s3, s6, s3
+; GFX8-NEXT:    s_sub_i32 s4, s4, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX8-NEXT:    s_cselect_b32 s4, s3, s11
+; GFX8-NEXT:    s_max_i32 s4, s3, s11
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s11
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX8-NEXT:    s_sub_i32 s3, s3, s9
+; GFX8-NEXT:    s_min_i32 s3, s3, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s3, s3, s9
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_min_i32 s3, s4, s3
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_sub_i32 s3, s5, s3
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -3816,92 +3428,67 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v6i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s12, -2
-; GFX6-NEXT:    s_cselect_b32 s14, s0, -1
+; GFX6-NEXT:    s_max_i32 s14, s0, -1
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s13, 1
-; GFX6-NEXT:    s_cselect_b32 s15, s0, -1
+; GFX6-NEXT:    s_min_i32 s15, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s14, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s14, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s15
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s15
-; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s6, s14, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s15
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s7, s1, -1
+; GFX6-NEXT:    s_max_i32 s7, s1, -1
+; GFX6-NEXT:    s_min_i32 s14, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s14, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s14
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
-; GFX6-NEXT:    s_sub_i32 s1, s1, s6
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s7, s2, -1
+; GFX6-NEXT:    s_min_i32 s6, s6, s14
+; GFX6-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s7, s2, -1
+; GFX6-NEXT:    s_min_i32 s8, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s8, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX6-NEXT:    s_sub_i32 s2, s2, s6
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s8
+; GFX6-NEXT:    s_max_i32 s7, s3, -1
+; GFX6-NEXT:    s_sub_i32 s2, s2, s6
+; GFX6-NEXT:    s_min_i32 s8, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s7, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s8, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s8
+; GFX6-NEXT:    s_max_i32 s7, s4, -1
+; GFX6-NEXT:    s_sub_i32 s3, s3, s6
+; GFX6-NEXT:    s_min_i32 s8, s4, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s7, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s8, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX6-NEXT:    s_sub_i32 s4, s4, s6
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_min_i32 s6, s6, s8
+; GFX6-NEXT:    s_max_i32 s7, s5, -1
+; GFX6-NEXT:    s_sub_i32 s4, s4, s6
+; GFX6-NEXT:    s_min_i32 s8, s5, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s7, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
-; GFX6-NEXT:    s_cmp_lt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s8, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
-; GFX6-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX6-NEXT:    s_max_i32 s6, s7, s6
+; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s6
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -3910,6 +3497,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_and_b32 s3, s5, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
@@ -3918,113 +3506,89 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s15, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s14, s15
+; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s16, s14, s15
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s14, s15
+; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX8-NEXT:    s_movk_i32 s13, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s14, s14, s15
-; GFX8-NEXT:    s_sub_i32 s14, s14, s13
+; GFX8-NEXT:    s_min_i32 s14, s14, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s16, s16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s16, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s16, s3
+; GFX8-NEXT:    s_sub_i32 s14, s14, s13
+; GFX8-NEXT:    s_max_i32 s3, s16, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s14
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s14
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s14
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_min_i32 s3, s3, s14
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s14, s3, s15
+; GFX8-NEXT:    s_max_i32 s14, s3, s15
 ; GFX8-NEXT:    s_sub_i32 s14, s14, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s3, s15
-; GFX8-NEXT:    s_cselect_b32 s3, s3, s15
-; GFX8-NEXT:    s_sub_i32 s3, s3, s13
+; GFX8-NEXT:    s_min_i32 s3, s3, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s14, s9
-; GFX8-NEXT:    s_cselect_b32 s9, s14, s9
+; GFX8-NEXT:    s_sub_i32 s3, s3, s13
+; GFX8-NEXT:    s_max_i32 s9, s14, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_cmp_lt_i32 s9, s3
-; GFX8-NEXT:    s_cselect_b32 s3, s9, s3
+; GFX8-NEXT:    s_min_i32 s3, s9, s3
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s15
-; GFX8-NEXT:    s_cselect_b32 s9, s6, s15
+; GFX8-NEXT:    s_max_i32 s9, s6, s15
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s15
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s15
-; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_min_i32 s6, s6, s15
+; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s9, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX8-NEXT:    s_max_i32 s4, s9, s4
+; GFX8-NEXT:    s_sub_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s15
-; GFX8-NEXT:    s_cselect_b32 s6, s4, s15
+; GFX8-NEXT:    s_max_i32 s6, s4, s15
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s15
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s15
-; GFX8-NEXT:    s_sub_i32 s4, s4, s13
+; GFX8-NEXT:    s_min_i32 s4, s4, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s10
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s9
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s9
+; GFX8-NEXT:    s_sub_i32 s4, s4, s13
+; GFX8-NEXT:    s_max_i32 s6, s6, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX8-NEXT:    s_sub_i32 s4, s7, s4
+; GFX8-NEXT:    s_min_i32 s4, s6, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s15
-; GFX8-NEXT:    s_cselect_b32 s7, s6, s15
+; GFX8-NEXT:    s_sub_i32 s4, s7, s4
+; GFX8-NEXT:    s_max_i32 s7, s6, s15
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s15
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s15
-; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_min_i32 s6, s6, s15
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX8-NEXT:    s_max_i32 s5, s7, s5
+; GFX8-NEXT:    s_sub_i32 s6, s6, s13
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s6
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX8-NEXT:    s_min_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s8
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s15
-; GFX8-NEXT:    s_cselect_b32 s6, s5, s15
+; GFX8-NEXT:    s_max_i32 s6, s5, s15
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s12
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s15
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s15
-; GFX8-NEXT:    s_sub_i32 s5, s5, s13
+; GFX8-NEXT:    s_min_i32 s5, s5, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s7
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX8-NEXT:    s_sub_i32 s5, s5, s13
+; GFX8-NEXT:    s_max_i32 s6, s6, s7
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s6, s5
-; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_or_b32 s0, s0, s3
 ; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX8-NEXT:    s_sub_i32 s5, s8, s5
@@ -4271,132 +3835,100 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-LABEL: s_ssubsat_v8i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s16, -2
-; GFX6-NEXT:    s_cselect_b32 s18, s0, -1
+; GFX6-NEXT:    s_max_i32 s18, s0, -1
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_sub_i32 s18, s18, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s0, -1
 ; GFX6-NEXT:    s_brev_b32 s17, 1
-; GFX6-NEXT:    s_cselect_b32 s19, s0, -1
+; GFX6-NEXT:    s_min_i32 s19, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s19, s19, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s18, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s18, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s19
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s19
-; GFX6-NEXT:    s_sub_i32 s0, s0, s8
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_max_i32 s8, s18, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s19
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s1, -1
+; GFX6-NEXT:    s_max_i32 s9, s1, -1
+; GFX6-NEXT:    s_min_i32 s18, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s1, -1
-; GFX6-NEXT:    s_cselect_b32 s18, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s18, s18, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s18
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s18
-; GFX6-NEXT:    s_sub_i32 s1, s1, s8
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s9, s2, -1
+; GFX6-NEXT:    s_min_i32 s8, s8, s18
+; GFX6-NEXT:    s_sub_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s2, -1
+; GFX6-NEXT:    s_min_i32 s10, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s2, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s2, s2, s8
-; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s9, s3, -1
+; GFX6-NEXT:    s_sub_i32 s2, s2, s8
+; GFX6-NEXT:    s_min_i32 s10, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s3, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s3, s3, s8
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s9, s4, -1
+; GFX6-NEXT:    s_sub_i32 s3, s3, s8
+; GFX6-NEXT:    s_min_i32 s10, s4, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s4, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s9, s5, -1
+; GFX6-NEXT:    s_sub_i32 s4, s4, s8
+; GFX6-NEXT:    s_min_i32 s10, s5, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s5, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s9, s6, -1
+; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s10, s6, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s6, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s6, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s6, s6, s8
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s9, s7, -1
+; GFX6-NEXT:    s_sub_i32 s6, s6, s8
+; GFX6-NEXT:    s_min_i32 s10, s7, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
-; GFX6-NEXT:    s_cmp_gt_i32 s7, -1
-; GFX6-NEXT:    s_cselect_b32 s9, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_i32 s7, -1
-; GFX6-NEXT:    s_cselect_b32 s10, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
-; GFX6-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s10
+; GFX6-NEXT:    s_max_i32 s8, s9, s8
+; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s8
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s8
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s8
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_and_b32 s3, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s8
 ; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -4405,151 +3937,119 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX8-LABEL: s_ssubsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s19, -1
-; GFX8-NEXT:    s_cmp_gt_i32 s18, s19
+; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
-; GFX8-NEXT:    s_cselect_b32 s20, s18, s19
 ; GFX8-NEXT:    s_sub_i32 s20, s20, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s18, s19
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_movk_i32 s17, 0x8000
-; GFX8-NEXT:    s_cselect_b32 s18, s18, s19
-; GFX8-NEXT:    s_sub_i32 s18, s18, s17
+; GFX8-NEXT:    s_min_i32 s18, s18, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s20, s20
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_gt_i32 s20, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s20, s4
+; GFX8-NEXT:    s_sub_i32 s18, s18, s17
+; GFX8-NEXT:    s_max_i32 s4, s20, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s18
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s18
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s18
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s18
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s8
-; GFX8-NEXT:    s_cmp_gt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s18, s4, s19
+; GFX8-NEXT:    s_max_i32 s18, s4, s19
 ; GFX8-NEXT:    s_sub_i32 s18, s18, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s4, s19
-; GFX8-NEXT:    s_cselect_b32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s4, s17
+; GFX8-NEXT:    s_min_i32 s4, s4, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_cmp_gt_i32 s18, s12
-; GFX8-NEXT:    s_cselect_b32 s12, s18, s12
+; GFX8-NEXT:    s_sub_i32 s4, s4, s17
+; GFX8-NEXT:    s_max_i32 s12, s18, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_cmp_lt_i32 s12, s4
-; GFX8-NEXT:    s_cselect_b32 s4, s12, s4
+; GFX8-NEXT:    s_min_i32 s4, s12, s4
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s1
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s12, s8, s19
+; GFX8-NEXT:    s_max_i32 s12, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_gt_i32 s12, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s12, s5
+; GFX8-NEXT:    s_max_i32 s5, s12, s5
+; GFX8-NEXT:    s_sub_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s8
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_min_i32 s5, s5, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s9
-; GFX8-NEXT:    s_cmp_gt_i32 s5, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s5, s19
+; GFX8-NEXT:    s_max_i32 s8, s5, s19
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s5, s19
-; GFX8-NEXT:    s_cselect_b32 s5, s5, s19
-; GFX8-NEXT:    s_sub_i32 s5, s5, s17
+; GFX8-NEXT:    s_min_i32 s5, s5, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s13
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s12
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s12
+; GFX8-NEXT:    s_sub_i32 s5, s5, s17
+; GFX8-NEXT:    s_max_i32 s8, s8, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s5
-; GFX8-NEXT:    s_cselect_b32 s5, s8, s5
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
+; GFX8-NEXT:    s_min_i32 s5, s8, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s2
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s9, s8, s19
+; GFX8-NEXT:    s_sub_i32 s5, s9, s5
+; GFX8-NEXT:    s_max_i32 s9, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_gt_i32 s9, s6
-; GFX8-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX8-NEXT:    s_max_i32 s6, s9, s6
+; GFX8-NEXT:    s_sub_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_min_i32 s6, s6, s8
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s10
-; GFX8-NEXT:    s_cmp_gt_i32 s6, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s6, s19
+; GFX8-NEXT:    s_max_i32 s8, s6, s19
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s6, s19
-; GFX8-NEXT:    s_cselect_b32 s6, s6, s19
-; GFX8-NEXT:    s_sub_i32 s6, s6, s17
+; GFX8-NEXT:    s_min_i32 s6, s6, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s14
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s9
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX8-NEXT:    s_sub_i32 s6, s6, s17
+; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX8-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX8-NEXT:    s_sub_i32 s6, s10, s6
+; GFX8-NEXT:    s_min_i32 s6, s8, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s3
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s9, s8, s19
+; GFX8-NEXT:    s_max_i32 s9, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s19
-; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_cmp_gt_i32 s9, s7
-; GFX8-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX8-NEXT:    s_max_i32 s7, s9, s7
+; GFX8-NEXT:    s_sub_i32 s8, s8, s17
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
-; GFX8-NEXT:    s_cmp_lt_i32 s7, s8
-; GFX8-NEXT:    s_cselect_b32 s7, s7, s8
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_min_i32 s7, s7, s8
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s11
-; GFX8-NEXT:    s_cmp_gt_i32 s7, s19
-; GFX8-NEXT:    s_cselect_b32 s8, s7, s19
+; GFX8-NEXT:    s_max_i32 s8, s7, s19
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s16
-; GFX8-NEXT:    s_cmp_lt_i32 s7, s19
-; GFX8-NEXT:    s_cselect_b32 s7, s7, s19
-; GFX8-NEXT:    s_sub_i32 s7, s7, s17
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_min_i32 s7, s7, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s15
-; GFX8-NEXT:    s_cmp_gt_i32 s8, s9
-; GFX8-NEXT:    s_cselect_b32 s8, s8, s9
-; GFX8-NEXT:    s_sext_i32_i16 s8, s8
-; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_cmp_lt_i32 s8, s7
-; GFX8-NEXT:    s_cselect_b32 s7, s8, s7
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_sub_i32 s7, s7, s17
+; GFX8-NEXT:    s_max_i32 s8, s8, s9
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
+; GFX8-NEXT:    s_sub_i32 s6, s10, s6
+; GFX8-NEXT:    s_sext_i32_i16 s8, s8
+; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_min_i32 s7, s8, s7
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_bfe_u32 s4, s6, 0x100000
 ; GFX8-NEXT:    s_sub_i32 s7, s11, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 5234d021259b..06232cec3309 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -53,8 +53,7 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
 ; GFX6-NEXT:    s_not_b32 s2, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_min_u32 s1, s2, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 25
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -143,8 +142,7 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_not_b32 s2, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_min_u32 s1, s2, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -272,17 +270,15 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_not_b32 s4, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX6-NEXT:    s_min_u32 s1, s4, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_not_b32 s3, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_min_u32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -521,31 +517,27 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_not_b32 s8, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s8, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s8, s1
+; GFX6-NEXT:    s_min_u32 s1, s8, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_not_b32 s5, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX6-NEXT:    s_min_u32 s2, s5, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
 ; GFX6-NEXT:    s_not_b32 s5, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX6-NEXT:    s_min_u32 s3, s5, s3
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
 ; GFX6-NEXT:    s_not_b32 s5, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_min_u32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
@@ -736,8 +728,7 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_not_b32 s2, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_min_u32 s1, s2, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -809,8 +800,7 @@ define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s2, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_min_u32 s1, s2, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -932,12 +922,10 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX6-LABEL: s_uaddsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s4, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_min_u32 s2, s4, s2
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_not_b32 s2, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX6-NEXT:    s_min_u32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1019,16 +1007,13 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6-LABEL: s_uaddsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s6, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s6, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX6-NEXT:    s_min_u32 s3, s6, s3
 ; GFX6-NEXT:    s_add_i32 s0, s0, s3
 ; GFX6-NEXT:    s_not_b32 s3, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s4
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX6-NEXT:    s_min_u32 s3, s3, s4
 ; GFX6-NEXT:    s_add_i32 s1, s1, s3
 ; GFX6-NEXT:    s_not_b32 s3, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s5
-; GFX6-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX6-NEXT:    s_min_u32 s3, s3, s5
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1124,20 +1109,16 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6-LABEL: s_uaddsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s8, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s8, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX6-NEXT:    s_min_u32 s4, s8, s4
 ; GFX6-NEXT:    s_add_i32 s0, s0, s4
 ; GFX6-NEXT:    s_not_b32 s4, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX6-NEXT:    s_min_u32 s4, s4, s5
 ; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_not_b32 s4, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX6-NEXT:    s_min_u32 s4, s4, s6
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
 ; GFX6-NEXT:    s_not_b32 s4, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s7
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s7
+; GFX6-NEXT:    s_min_u32 s4, s4, s7
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1247,24 +1228,19 @@ define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6-LABEL: s_uaddsat_v5i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s10, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s10, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s10, s5
+; GFX6-NEXT:    s_min_u32 s5, s10, s5
 ; GFX6-NEXT:    s_add_i32 s0, s0, s5
 ; GFX6-NEXT:    s_not_b32 s5, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX6-NEXT:    s_min_u32 s5, s5, s6
 ; GFX6-NEXT:    s_add_i32 s1, s1, s5
 ; GFX6-NEXT:    s_not_b32 s5, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s7
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s7
+; GFX6-NEXT:    s_min_u32 s5, s5, s7
 ; GFX6-NEXT:    s_add_i32 s2, s2, s5
 ; GFX6-NEXT:    s_not_b32 s5, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s8
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s8
+; GFX6-NEXT:    s_min_u32 s5, s5, s8
 ; GFX6-NEXT:    s_add_i32 s3, s3, s5
 ; GFX6-NEXT:    s_not_b32 s5, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s9
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s9
+; GFX6-NEXT:    s_min_u32 s5, s5, s9
 ; GFX6-NEXT:    s_add_i32 s4, s4, s5
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1448,68 +1424,52 @@ define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX6-LABEL: s_uaddsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_not_b32 s32, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s32, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s32, s16
+; GFX6-NEXT:    s_min_u32 s16, s32, s16
 ; GFX6-NEXT:    s_add_i32 s0, s0, s16
 ; GFX6-NEXT:    s_not_b32 s16, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX6-NEXT:    s_min_u32 s16, s16, s17
 ; GFX6-NEXT:    s_add_i32 s1, s1, s16
 ; GFX6-NEXT:    s_not_b32 s16, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s18
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s18
+; GFX6-NEXT:    s_min_u32 s16, s16, s18
 ; GFX6-NEXT:    s_add_i32 s2, s2, s16
 ; GFX6-NEXT:    s_not_b32 s16, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s19
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s19
+; GFX6-NEXT:    s_min_u32 s16, s16, s19
 ; GFX6-NEXT:    s_add_i32 s3, s3, s16
 ; GFX6-NEXT:    s_not_b32 s16, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s20
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s20
+; GFX6-NEXT:    s_min_u32 s16, s16, s20
 ; GFX6-NEXT:    s_add_i32 s4, s4, s16
 ; GFX6-NEXT:    s_not_b32 s16, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s21
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s21
+; GFX6-NEXT:    s_min_u32 s16, s16, s21
 ; GFX6-NEXT:    s_add_i32 s5, s5, s16
 ; GFX6-NEXT:    s_not_b32 s16, s6
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s22
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s22
+; GFX6-NEXT:    s_min_u32 s16, s16, s22
 ; GFX6-NEXT:    s_add_i32 s6, s6, s16
 ; GFX6-NEXT:    s_not_b32 s16, s7
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s23
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s23
+; GFX6-NEXT:    s_min_u32 s16, s16, s23
 ; GFX6-NEXT:    s_add_i32 s7, s7, s16
 ; GFX6-NEXT:    s_not_b32 s16, s8
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s24
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s24
+; GFX6-NEXT:    s_min_u32 s16, s16, s24
 ; GFX6-NEXT:    s_add_i32 s8, s8, s16
 ; GFX6-NEXT:    s_not_b32 s16, s9
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s25
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s25
+; GFX6-NEXT:    s_min_u32 s16, s16, s25
 ; GFX6-NEXT:    s_add_i32 s9, s9, s16
 ; GFX6-NEXT:    s_not_b32 s16, s10
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s26
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s26
+; GFX6-NEXT:    s_min_u32 s16, s16, s26
 ; GFX6-NEXT:    s_add_i32 s10, s10, s16
 ; GFX6-NEXT:    s_not_b32 s16, s11
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s27
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s27
+; GFX6-NEXT:    s_min_u32 s16, s16, s27
 ; GFX6-NEXT:    s_add_i32 s11, s11, s16
 ; GFX6-NEXT:    s_not_b32 s16, s12
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s28
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s28
+; GFX6-NEXT:    s_min_u32 s16, s16, s28
 ; GFX6-NEXT:    s_add_i32 s12, s12, s16
 ; GFX6-NEXT:    s_not_b32 s16, s13
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s29
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s29
+; GFX6-NEXT:    s_min_u32 s16, s16, s29
 ; GFX6-NEXT:    s_add_i32 s13, s13, s16
 ; GFX6-NEXT:    s_not_b32 s16, s14
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s30
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s30
+; GFX6-NEXT:    s_min_u32 s16, s16, s30
 ; GFX6-NEXT:    s_add_i32 s14, s14, s16
 ; GFX6-NEXT:    s_not_b32 s16, s15
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s31
-; GFX6-NEXT:    s_cselect_b32 s16, s16, s31
+; GFX6-NEXT:    s_min_u32 s16, s16, s31
 ; GFX6-NEXT:    s_add_i32 s15, s15, s16
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1696,8 +1656,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_not_b32 s2, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_min_u32 s1, s2, s1
 ; GFX6-NEXT:    s_add_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1835,17 +1794,15 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_not_b32 s4, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
+; GFX6-NEXT:    s_min_u32 s2, s4, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_not_b32 s3, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_min_u32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2053,33 +2010,29 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_not_b32 s8, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s8, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX6-NEXT:    s_add_i32 s0, s0, s4
+; GFX6-NEXT:    s_min_u32 s4, s8, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_not_b32 s5, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_add_i32 s1, s1, s4
+; GFX6-NEXT:    s_min_u32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
 ; GFX6-NEXT:    s_not_b32 s5, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX6-NEXT:    s_add_i32 s2, s2, s4
+; GFX6-NEXT:    s_min_u32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_not_b32 s5, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_min_u32 s4, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2234,49 +2187,43 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_not_b32 s12, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s12, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s12, s6
-; GFX6-NEXT:    s_add_i32 s0, s0, s6
+; GFX6-NEXT:    s_min_u32 s6, s12, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_not_b32 s7, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_add_i32 s1, s1, s6
+; GFX6-NEXT:    s_min_u32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
 ; GFX6-NEXT:    s_not_b32 s7, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_add_i32 s2, s2, s6
+; GFX6-NEXT:    s_min_u32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
 ; GFX6-NEXT:    s_not_b32 s7, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_add_i32 s3, s3, s6
+; GFX6-NEXT:    s_min_u32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX6-NEXT:    s_add_i32 s3, s3, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
 ; GFX6-NEXT:    s_not_b32 s7, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_add_i32 s4, s4, s6
+; GFX6-NEXT:    s_min_u32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    s_add_i32 s4, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_not_b32 s7, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_u32 s6, s7, s6
 ; GFX6-NEXT:    s_add_i32 s5, s5, s6
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2454,65 +2401,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_not_b32 s16, s0
-; GFX6-NEXT:    s_cmp_lt_u32 s16, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s16, s8
-; GFX6-NEXT:    s_add_i32 s0, s0, s8
+; GFX6-NEXT:    s_min_u32 s8, s16, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_not_b32 s9, s1
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s1, s1, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
 ; GFX6-NEXT:    s_not_b32 s9, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s2, s2, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX6-NEXT:    s_add_i32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
 ; GFX6-NEXT:    s_not_b32 s9, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s3, s3, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX6-NEXT:    s_add_i32 s3, s3, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
 ; GFX6-NEXT:    s_not_b32 s9, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
 ; GFX6-NEXT:    s_not_b32 s9, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX6-NEXT:    s_add_i32 s5, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
 ; GFX6-NEXT:    s_not_b32 s9, s6
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s6, s6, s8
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_add_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_not_b32 s9, s7
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_u32 s8, s9, s8
 ; GFX6-NEXT:    s_add_i32 s7, s7, s8
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    s_or_b32 s3, s6, s3
 ; GFX6-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 3a44903f80f1..cf3427dac88f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -51,8 +51,7 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 25
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -139,8 +138,7 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -265,16 +263,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s1, s2
+; GFX6-NEXT:    s_min_u32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -508,28 +504,24 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s1, s2
+; GFX6-NEXT:    s_min_u32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s2, s3
+; GFX6-NEXT:    s_min_u32 s3, s2, s3
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX6-NEXT:    s_min_u32 s4, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
@@ -718,8 +710,7 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -789,8 +780,7 @@ define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
 define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -907,11 +897,9 @@ define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s0, s2
+; GFX6-NEXT:    s_min_u32 s2, s0, s2
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s1, s3
+; GFX6-NEXT:    s_min_u32 s2, s1, s3
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -989,14 +977,11 @@ define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s3
-; GFX6-NEXT:    s_cselect_b32 s3, s0, s3
+; GFX6-NEXT:    s_min_u32 s3, s0, s3
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s4
-; GFX6-NEXT:    s_cselect_b32 s3, s1, s4
+; GFX6-NEXT:    s_min_u32 s3, s1, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s3
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s5
-; GFX6-NEXT:    s_cselect_b32 s3, s2, s5
+; GFX6-NEXT:    s_min_u32 s3, s2, s5
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1087,17 +1072,13 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s0, s4
+; GFX6-NEXT:    s_min_u32 s4, s0, s4
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s1, s5
+; GFX6-NEXT:    s_min_u32 s4, s1, s5
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s6
-; GFX6-NEXT:    s_cselect_b32 s4, s2, s6
+; GFX6-NEXT:    s_min_u32 s4, s2, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s7
-; GFX6-NEXT:    s_cselect_b32 s4, s3, s7
+; GFX6-NEXT:    s_min_u32 s4, s3, s7
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1201,20 +1182,15 @@ define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s0, s5
+; GFX6-NEXT:    s_min_u32 s5, s0, s5
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s6
-; GFX6-NEXT:    s_cselect_b32 s5, s1, s6
+; GFX6-NEXT:    s_min_u32 s5, s1, s6
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s7
-; GFX6-NEXT:    s_cselect_b32 s5, s2, s7
+; GFX6-NEXT:    s_min_u32 s5, s2, s7
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s8
-; GFX6-NEXT:    s_cselect_b32 s5, s3, s8
+; GFX6-NEXT:    s_min_u32 s5, s3, s8
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s5
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s9
-; GFX6-NEXT:    s_cselect_b32 s5, s4, s9
+; GFX6-NEXT:    s_min_u32 s5, s4, s9
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1381,53 +1357,37 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s16
-; GFX6-NEXT:    s_cselect_b32 s16, s0, s16
+; GFX6-NEXT:    s_min_u32 s16, s0, s16
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s17
-; GFX6-NEXT:    s_cselect_b32 s16, s1, s17
+; GFX6-NEXT:    s_min_u32 s16, s1, s17
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s18
-; GFX6-NEXT:    s_cselect_b32 s16, s2, s18
+; GFX6-NEXT:    s_min_u32 s16, s2, s18
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s19
-; GFX6-NEXT:    s_cselect_b32 s16, s3, s19
+; GFX6-NEXT:    s_min_u32 s16, s3, s19
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s20
-; GFX6-NEXT:    s_cselect_b32 s16, s4, s20
+; GFX6-NEXT:    s_min_u32 s16, s4, s20
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s21
-; GFX6-NEXT:    s_cselect_b32 s16, s5, s21
+; GFX6-NEXT:    s_min_u32 s16, s5, s21
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s6, s22
-; GFX6-NEXT:    s_cselect_b32 s16, s6, s22
+; GFX6-NEXT:    s_min_u32 s16, s6, s22
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s23
-; GFX6-NEXT:    s_cselect_b32 s16, s7, s23
+; GFX6-NEXT:    s_min_u32 s16, s7, s23
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s8, s24
-; GFX6-NEXT:    s_cselect_b32 s16, s8, s24
+; GFX6-NEXT:    s_min_u32 s16, s8, s24
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s9, s25
-; GFX6-NEXT:    s_cselect_b32 s16, s9, s25
+; GFX6-NEXT:    s_min_u32 s16, s9, s25
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s10, s26
-; GFX6-NEXT:    s_cselect_b32 s16, s10, s26
+; GFX6-NEXT:    s_min_u32 s16, s10, s26
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s11, s27
-; GFX6-NEXT:    s_cselect_b32 s16, s11, s27
+; GFX6-NEXT:    s_min_u32 s16, s11, s27
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s12, s28
-; GFX6-NEXT:    s_cselect_b32 s16, s12, s28
+; GFX6-NEXT:    s_min_u32 s16, s12, s28
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s13, s29
-; GFX6-NEXT:    s_cselect_b32 s16, s13, s29
+; GFX6-NEXT:    s_min_u32 s16, s13, s29
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s14, s30
-; GFX6-NEXT:    s_cselect_b32 s16, s14, s30
+; GFX6-NEXT:    s_min_u32 s16, s14, s30
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s16
-; GFX6-NEXT:    s_cmp_lt_u32 s15, s31
-; GFX6-NEXT:    s_cselect_b32 s16, s15, s31
+; GFX6-NEXT:    s_min_u32 s16, s15, s31
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s16
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1612,8 +1572,7 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX6-NEXT:    s_cselect_b32 s1, s0, s1
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1746,16 +1705,14 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s0, s2
+; GFX6-NEXT:    s_min_u32 s2, s0, s2
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s2
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s1, s2
+; GFX6-NEXT:    s_min_u32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1954,30 +1911,26 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s0, s4
+; GFX6-NEXT:    s_min_u32 s4, s0, s4
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s1, s4
+; GFX6-NEXT:    s_min_u32 s4, s1, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s2, s4
+; GFX6-NEXT:    s_min_u32 s4, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s3, s4
+; GFX6-NEXT:    s_min_u32 s4, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2125,44 +2078,38 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s0, s6
+; GFX6-NEXT:    s_min_u32 s6, s0, s6
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s6
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s1, s6
+; GFX6-NEXT:    s_min_u32 s6, s1, s6
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s6
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s2, s6
+; GFX6-NEXT:    s_min_u32 s6, s2, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s3, s6
+; GFX6-NEXT:    s_min_u32 s6, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s4, s6
+; GFX6-NEXT:    s_min_u32 s6, s4, s6
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s6
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s6
-; GFX6-NEXT:    s_cselect_b32 s6, s5, s6
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_u32 s6, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2331,58 +2278,50 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s0, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s0, s8
+; GFX6-NEXT:    s_min_u32 s8, s0, s8
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s8
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s1, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s1, s8
+; GFX6-NEXT:    s_min_u32 s8, s1, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s8
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s2, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s2, s8
+; GFX6-NEXT:    s_min_u32 s8, s2, s8
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s8
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s3, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s3, s8
+; GFX6-NEXT:    s_min_u32 s8, s3, s8
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s8
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s4, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s4, s8
+; GFX6-NEXT:    s_min_u32 s8, s4, s8
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s5, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s5, s8
+; GFX6-NEXT:    s_min_u32 s8, s5, s8
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s6, s8
+; GFX6-NEXT:    s_min_u32 s8, s6, s8
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s8
-; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
-; GFX6-NEXT:    s_cmp_lt_u32 s7, s8
-; GFX6-NEXT:    s_cselect_b32 s8, s7, s8
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_u32 s8, s7, s8
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s8
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    s_or_b32 s3, s6, s3
 ; GFX6-NEXT:    ; return to shader part epilog


        


More information about the llvm-commits mailing list