[llvm] 5819159 - AMDGPU/GlobalISel: Pack constant G_BUILD_VECTOR_TRUNCs when selecting

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 26 06:55:41 PDT 2020


Author: Matt Arsenault
Date: 2020-07-26T09:55:34-04:00
New Revision: 5819159995657091e4e21e538509b2af210fd48d

URL: https://github.com/llvm/llvm-project/commit/5819159995657091e4e21e538509b2af210fd48d
DIFF: https://github.com/llvm/llvm-project/commit/5819159995657091e4e21e538509b2af210fd48d.diff

LOG: AMDGPU/GlobalISel: Pack constant G_BUILD_VECTOR_TRUNCs when selecting

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a126ed1daf17..8bc597664634 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -616,11 +616,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
   return true;
 }
 
-static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
-  int64_t Val;
-  return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
-}
-
 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   MachineInstr &MI) const {
   if (selectImpl(MI, *CoverageInfo))
@@ -645,6 +640,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock *BB = MI.getParent();
 
+  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
+  if (ConstSrc1) {
+    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
+    if (ConstSrc0) {
+      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
+      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
+
+      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
+        .addImm(Lo16 | (Hi16 << 16));
+      MI.eraseFromParent();
+      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
+    }
+  }
+
   // TODO: This should probably be a combine somewhere
   // (build_vector_trunc $src0, undef -> copy $src0
   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
@@ -686,7 +695,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   } else if (Shift1) {
     Opc = AMDGPU::S_PACK_LH_B32_B16;
     MI.getOperand(2).setReg(ShiftSrc1);
-  } else if (Shift0 && isZero(Src1, *MRI)) {
+  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
       .addReg(ShiftSrc0)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c6c0eb7c4a93..2205bfe3c71d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -91,9 +91,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0xffc0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -113,8 +112,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 0xffffffc0, 4
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo:
@@ -133,8 +132,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 4, 0xffffffc0
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
@@ -152,13 +151,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s1, 0xffc0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, 0xffc0ffc0
+; GFX9-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
@@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, 0xffffffc0, 4
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, 0x4ffc0
+; GFX9-NEXT:    s_add_i32 s1, s1, 4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
@@ -210,12 +204,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, 4, 0xffffffc0
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, 0xffc00004
+; GFX9-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 71ee562f0ecc..c1896f81ef29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -522,8 +522,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_ashr_v2i16_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT:    v_pk_ashrrev_i16 v0, s4, v0
+; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
index c380d3c77def..056ea79a9898 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
@@ -430,3 +430,273 @@ body: |
     %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4
     S_ENDPGM 0, implicit %5
 ...
+
+---
+name: test_build_vector_trunc_s_v2s16_constant_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539
+    ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s32) = G_CONSTANT i32 123
+    %1:sgpr(s32) = G_CONSTANT i32 456
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_constant_impdef
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
+    ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s32) = G_CONSTANT i32 123
+    %1:sgpr(s32) = G_IMPLICIT_DEF
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_impdef_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant
+    ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s32) = G_IMPLICIT_DEF
+    %1:sgpr(s32) = G_CONSTANT i32 123
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_impdef_impdef
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef
+    ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GFX9: S_ENDPGM 0, implicit [[DEF]]
+    %0:sgpr(s32) = G_IMPLICIT_DEF
+    %1:sgpr(s32) = G_IMPLICIT_DEF
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539
+    ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s16) = G_CONSTANT i16 123
+    %1:sgpr(s16) = G_CONSTANT i16 456
+    %2:sgpr(s32) = G_ZEXT %0
+    %3:sgpr(s32) = G_ZEXT %1
+    %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant
+    ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
+    ; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc
+    ; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s16) = G_IMPLICIT_DEF
+    %1:sgpr(s16) = G_CONSTANT i16 123
+    %2:sgpr(s32) = G_ZEXT %0
+    %3:sgpr(s32) = G_ZEXT %1
+    %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208
+    ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s16) = G_CONSTANT i16 -16
+    %1:sgpr(s16) = G_CONSTANT i16 -3
+    %2:sgpr(s32) = G_SEXT %0
+    %3:sgpr(s32) = G_SEXT %1
+    %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
+    ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s16) = G_CONSTANT i16 123
+    %1:sgpr(s16) = G_CONSTANT i16 456
+    %2:sgpr(s32) = G_ANYEXT %0
+    %3:sgpr(s32) = G_ANYEXT %1
+    %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant
+    ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s16) = G_IMPLICIT_DEF
+    %1:sgpr(s16) = G_CONSTANT i16 123
+    %2:sgpr(s32) = G_ANYEXT %0
+    %3:sgpr(s32) = G_ANYEXT %1
+    %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_var_constant
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant
+    ; GFX9: liveins: $sgpr0
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = G_CONSTANT i32 456
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_constant_var
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var
+    ; GFX9: liveins: $sgpr0
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s32) = G_CONSTANT i32 456
+    %1:sgpr(s32) = COPY $sgpr0
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_var_0
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0
+    ; GFX9: liveins: $sgpr0
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = G_CONSTANT i32 0
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_0_var
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var
+    ; GFX9: liveins: $sgpr0
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
+    ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+    %0:sgpr(s32) = G_CONSTANT i32 0
+    %1:sgpr(s32) = COPY $sgpr0
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+    S_ENDPGM 0, implicit %2
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index fdcf0f1515f9..172656f08aef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -82,24 +82,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
+; GFX906-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
+; GFX908-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
+; GFX10-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -109,24 +106,21 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -136,29 +130,21 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s5
-; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
+; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s5
-; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
+; GFX908-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -168,29 +154,21 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s5
-; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
+; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s5
-; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
+; GFX908-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, 8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
   ret i32 %r

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index d285ee132cc2..976536c72883 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -82,24 +82,21 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
+; GFX906-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
+; GFX908-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
+; GFX10-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -109,24 +106,21 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, s4, v1
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, s4, v1
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, s4, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -136,29 +130,21 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s5
-; GFX906-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
+; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s5
-; GFX908-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
+; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_u32_u16 v0, s4, s5, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -168,29 +154,21 @@ define i32 @v_udot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s5
-; GFX906-NEXT:    v_dot2_u32_u16 v0, s4, v0, 8
+; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s5
-; GFX908-NEXT:    v_dot2_u32_u16 v0, s4, v0, 8
+; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_dot2_u32_u16 v0, s4, s5, 8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
   ret i32 %r

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 9d82396bbc36..ea2631cbcb29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -533,8 +533,7 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_lshr_v2i16_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s4, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index b2e7f1ea326f..ba672883fa56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4527,15 +4527,12 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v3, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, s5, v3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v2, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v2, s4, v2
+; GFX9-NEXT:    v_pk_max_i16 v2, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v2, v3, v2
+; GFX9-NEXT:    v_pk_min_i16 v3, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v3, v4, v3
 ; GFX9-NEXT:    v_pk_max_i16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
@@ -4545,16 +4542,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v2, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_max_i16 v3, v0, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX10-NEXT:    v_pk_min_i16 v2, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v3, v0, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v2, s5, v2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s6
-; GFX10-NEXT:    v_pk_sub_i16 v3, s4, v3
+; GFX10-NEXT:    v_pk_sub_i16 v2, 0x80008000, v2
+; GFX10-NEXT:    v_pk_sub_i16 v3, 0x7fff7fff, v3
 ; GFX10-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
@@ -4650,53 +4642,45 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX9-LABEL: s_saddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s7, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX9-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s9, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_sub_i32 s2, s2, s8
-; GFX9-NEXT:    s_sub_i32 s8, s9, s10
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX9-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s3, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s3
-; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    s_sub_i32 s4, s5, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT:    s_sext_i32_i16 s4, s3
-; GFX9-NEXT:    s_sext_i32_i16 s5, s1
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX9-NEXT:    s_sext_i32_i16 s3, s1
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX9-NEXT:    s_sext_i32_i16 s2, s0
+; GFX9-NEXT:    s_ashr_i32 s3, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s4, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s2, s4
+; GFX9-NEXT:    s_cmp_gt_i32 s3, 0
+; GFX9-NEXT:    s_cselect_b32 s6, s3, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX9-NEXT:    s_sub_i32 s5, 0x7fff7fff, s5
+; GFX9-NEXT:    s_sub_i32 s6, 0x7fff, s6
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    s_sub_i32 s2, 0x80008000, s2
+; GFX9-NEXT:    s_sub_i32 s3, 0x8000, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_sext_i32_i16 s3, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s4
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX9-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX9-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s1
+; GFX9-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
+; GFX9-NEXT:    s_sext_i32_i16 s2, s1
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX9-NEXT:    s_sext_i32_i16 s3, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s4
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
@@ -4706,55 +4690,47 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX10-LABEL: s_saddsat_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s3, s0
-; GFX10-NEXT:    s_sext_i32_i16 s5, s2
+; GFX10-NEXT:    s_sext_i32_i16 s2, s0
+; GFX10-NEXT:    s_sext_i32_i16 s3, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX10-NEXT:    s_movk_i32 s7, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s5
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s7
-; GFX10-NEXT:    s_cselect_b32 s8, s4, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s2, s3
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
-; GFX10-NEXT:    s_sub_i32 s6, s7, s6
-; GFX10-NEXT:    s_sub_i32 s7, s8, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s2
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX10-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_sub_i32 s2, s4, s2
-; GFX10-NEXT:    s_sub_i32 s3, s3, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s2, s3
+; GFX10-NEXT:    s_cmp_gt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s4, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX10-NEXT:    s_sub_i32 s5, 0x7fff7fff, s5
+; GFX10-NEXT:    s_sub_i32 s6, 0x7fff, s6
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s3
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX10-NEXT:    s_cmp_lt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s3, s4, 0
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX10-NEXT:    s_sub_i32 s2, 0x80008000, s2
+; GFX10-NEXT:    s_sub_i32 s3, 0x8000, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
 ; GFX10-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX10-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
 ; GFX10-NEXT:    s_cmp_gt_i32 s2, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s6
 ; GFX10-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s6, s7
+; GFX10-NEXT:    s_sext_i32_i16 s2, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_sext_i32_i16 s3, s1
+; GFX10-NEXT:    s_ashr_i32 s3, s4, 16
+; GFX10-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s2
+; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX10-NEXT:    s_cmp_lt_i32 s1, s3
+; GFX10-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s1
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
@@ -4834,73 +4810,57 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX9-LABEL: saddsat_v2i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s6, s3
-; GFX9-NEXT:    s_sext_i32_i16 s4, s0
-; GFX9-NEXT:    s_ashr_i32 s5, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s7, s4, s6
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX9-NEXT:    s_sub_i32 s1, s1, s7
-; GFX9-NEXT:    s_sub_i32 s7, s8, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s3, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    v_pk_max_i16 v0, s2, v0
-; GFX9-NEXT:    v_pk_min_i16 v0, v0, s1
+; GFX9-NEXT:    s_sext_i32_i16 s1, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s3, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s1, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s2, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_sub_i32 s4, 0x7fff7fff, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0x7fff, s5
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX9-NEXT:    s_cmp_lt_i32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_sub_i32 s1, 0x80008000, s1
+; GFX9-NEXT:    s_sub_i32 s2, 0x8000, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    v_pk_max_i16 v0, s1, v0
+; GFX9-NEXT:    v_pk_min_i16 v0, v0, s4
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: saddsat_v2i16_sv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s2, s0
-; GFX10-NEXT:    s_sext_i32_i16 s4, s1
+; GFX10-NEXT:    s_sext_i32_i16 s1, s0
+; GFX10-NEXT:    s_sext_i32_i16 s2, 0
 ; GFX10-NEXT:    s_ashr_i32 s3, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s2, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s5, s2, s4
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
-; GFX10-NEXT:    s_cselect_b32 s7, s3, s1
+; GFX10-NEXT:    s_cmp_gt_i32 s1, s2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s6, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX10-NEXT:    s_sub_i32 s5, s6, s5
-; GFX10-NEXT:    s_sub_i32 s6, s7, s8
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s1
-; GFX10-NEXT:    s_movk_i32 s4, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
-; GFX10-NEXT:    s_lshr_b32 s2, s3, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_sub_i32 s1, s3, s1
-; GFX10-NEXT:    s_sub_i32 s2, s2, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s1, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s3, 0
+; GFX10-NEXT:    s_cselect_b32 s5, s3, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, 0x7fff7fff, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0x7fff, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX10-NEXT:    s_cselect_b32 s2, s3, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-NEXT:    s_sub_i32 s1, 0x80008000, s1
+; GFX10-NEXT:    s_sub_i32 s2, 0x8000, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX10-NEXT:    v_pk_max_i16 v0, s1, v0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s5, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s4, s5
 ; GFX10-NEXT:    v_pk_min_i16 v0, v0, s1
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -4966,15 +4926,12 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX9-LABEL: saddsat_v2i16_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, 0, 0
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    v_pk_min_i16 v2, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v2, s2, v2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    v_pk_max_i16 v1, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1
+; GFX9-NEXT:    v_pk_max_i16 v1, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v1, v2, v1
+; GFX9-NEXT:    v_pk_min_i16 v2, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v2, v3, v2
 ; GFX9-NEXT:    v_pk_max_i16 v2, v2, s0
 ; GFX9-NEXT:    v_pk_min_i16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
@@ -4982,16 +4939,11 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX10-LABEL: saddsat_v2i16_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, 0, 0
-; GFX10-NEXT:    s_movk_i32 s2, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v1, v0, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX10-NEXT:    v_pk_max_i16 v2, v0, s1
-; GFX10-NEXT:    s_movk_i32 s3, 0x7fff
+; GFX10-NEXT:    v_pk_min_i16 v1, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v2, v0, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v1, s2, v1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s3
-; GFX10-NEXT:    v_pk_sub_i16 v2, s1, v2
+; GFX10-NEXT:    v_pk_sub_i16 v1, 0x80008000, v1
+; GFX10-NEXT:    v_pk_sub_i16 v2, 0x7fff7fff, v2
 ; GFX10-NEXT:    v_pk_max_i16 v1, v1, s0
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
@@ -5113,22 +5065,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v5, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, s5, v5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v4, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
-; GFX9-NEXT:    v_pk_max_i16 v2, v5, v2
+; GFX9-NEXT:    v_pk_min_i16 v6, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v6, v7, v6
+; GFX9-NEXT:    v_pk_max_i16 v4, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v4, v5, v4
+; GFX9-NEXT:    v_pk_max_i16 v2, v6, v2
 ; GFX9-NEXT:    v_pk_min_i16 v2, v2, v4
-; GFX9-NEXT:    v_pk_min_i16 v4, v1, s6
+; GFX9-NEXT:    v_pk_min_i16 v4, v1, 0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_i16 v2, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s5, v4
-; GFX9-NEXT:    v_pk_sub_i16 v2, s4, v2
+; GFX9-NEXT:    v_pk_max_i16 v2, v1, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v7, v4
+; GFX9-NEXT:    v_pk_sub_i16 v2, v5, v2
 ; GFX9-NEXT:    v_pk_max_i16 v3, v4, v3
 ; GFX9-NEXT:    v_pk_min_i16 v2, v3, v2
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v2
@@ -5138,24 +5087,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 0, 0
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v4, v0, s5
-; GFX10-NEXT:    v_pk_min_i16 v5, v1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
-; GFX10-NEXT:    v_pk_max_i16 v6, v0, s5
-; GFX10-NEXT:    v_pk_max_i16 v7, v1, s5
-; GFX10-NEXT:    v_pk_sub_i16 v4, s6, v4
-; GFX10-NEXT:    v_pk_sub_i16 v5, s6, v5
-; GFX10-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX10-NEXT:    v_pk_min_i16 v4, v0, 0
+; GFX10-NEXT:    v_pk_min_i16 v5, v1, 0
+; GFX10-NEXT:    v_pk_max_i16 v6, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v7, v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX10-NEXT:    v_pk_sub_i16 v4, 0x80008000, v4
+; GFX10-NEXT:    v_pk_sub_i16 v5, 0x80008000, v5
+; GFX10-NEXT:    v_pk_sub_i16 v6, 0x7fff7fff, v6
+; GFX10-NEXT:    v_pk_sub_i16 v7, 0x7fff7fff, v7
 ; GFX10-NEXT:    v_pk_max_i16 v11, v4, v2
-; GFX10-NEXT:    v_pk_sub_i16 v6, s4, v6
-; GFX10-NEXT:    v_pk_sub_i16 v4, s4, v7
-; GFX10-NEXT:    v_pk_max_i16 v3, v5, v3
+; GFX10-NEXT:    v_pk_max_i16 v10, v5, v3
 ; GFX10-NEXT:    v_pk_min_i16 v2, v11, v6
-; GFX10-NEXT:    v_pk_min_i16 v3, v3, v4
+; GFX10-NEXT:    v_pk_min_i16 v3, v10, v7
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -5321,76 +5265,72 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s9, s6
-; GFX9-NEXT:    s_sext_i32_i16 s7, s0
-; GFX9-NEXT:    s_ashr_i32 s8, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s6, s0
+; GFX9-NEXT:    s_ashr_i32 s7, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s8, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s9, s6, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s7, 0
+; GFX9-NEXT:    s_cselect_b32 s10, s7, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s11, s9, 16
+; GFX9-NEXT:    s_movk_i32 s10, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s9, s4, s9
+; GFX9-NEXT:    s_sub_i32 s11, s10, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s7, 0
+; GFX9-NEXT:    s_cselect_b32 s7, s7, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s11
+; GFX9-NEXT:    s_mov_b32 s5, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s11, s6, 16
+; GFX9-NEXT:    s_mov_b32 s7, 0x8000
+; GFX9-NEXT:    s_sub_i32 s6, s5, s6
+; GFX9-NEXT:    s_sub_i32 s11, s7, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s11
+; GFX9-NEXT:    s_sext_i32_i16 s11, s6
+; GFX9-NEXT:    s_sext_i32_i16 s12, s2
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s9
-; GFX9-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s11, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    s_lshr_b32 s12, s10, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX9-NEXT:    s_sub_i32 s10, s4, s10
-; GFX9-NEXT:    s_sub_i32 s12, s11, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX9-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX9-NEXT:    s_sub_i32 s7, s5, s7
-; GFX9-NEXT:    s_sub_i32 s12, s8, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
-; GFX9-NEXT:    s_sext_i32_i16 s12, s7
-; GFX9-NEXT:    s_sext_i32_i16 s13, s2
-; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s13
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
-; GFX9-NEXT:    s_sext_i32_i16 s7, s2
-; GFX9-NEXT:    s_sext_i32_i16 s12, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s11, s12
+; GFX9-NEXT:    s_cselect_b32 s11, s11, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s6, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
+; GFX9-NEXT:    s_sext_i32_i16 s6, s2
+; GFX9-NEXT:    s_sext_i32_i16 s11, s9
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s7, s12
-; GFX9-NEXT:    s_cselect_b32 s7, s7, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s10
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s2
-; GFX9-NEXT:    s_add_i32 s7, s7, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_ashr_i32 s7, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s2, s9
-; GFX9-NEXT:    s_cselect_b32 s10, s2, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s12, s7, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s10, 16
-; GFX9-NEXT:    s_sub_i32 s4, s4, s10
-; GFX9-NEXT:    s_sub_i32 s10, s11, s12
+; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s6, s11
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s11
 ; GFX9-NEXT:    s_cmp_lt_i32 s2, s9
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s6, s2
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, s2
+; GFX9-NEXT:    s_add_i32 s6, s6, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_sext_i32_i16 s2, s1
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s9, s2, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s11, s6, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s11
+; GFX9-NEXT:    s_lshr_b32 s11, s9, 16
+; GFX9-NEXT:    s_sub_i32 s4, s4, s9
+; GFX9-NEXT:    s_sub_i32 s9, s10, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s6, s6, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_sub_i32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s8, s6
+; GFX9-NEXT:    s_sub_i32 s5, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s6, s3
@@ -5401,7 +5341,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX9-NEXT:    s_cmp_gt_i32 s2, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s9
 ; GFX9-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
@@ -5420,94 +5360,90 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX10-LABEL: s_saddsat_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s5, s0
-; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_sext_i32_i16 s4, s0
+; GFX10-NEXT:    s_sext_i32_i16 s5, 0
 ; GFX10-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX10-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s9
-; GFX10-NEXT:    s_cselect_b32 s10, s6, s4
-; GFX10-NEXT:    s_movk_i32 s12, 0x8000
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s9, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s8, 16
-; GFX10-NEXT:    s_sub_i32 s8, s9, s8
-; GFX10-NEXT:    s_sub_i32 s11, s10, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s12
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX10-NEXT:    s_sext_i32_i16 s14, s2
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s4
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s6, s12, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX10-NEXT:    s_sub_i32 s5, s12, s5
-; GFX10-NEXT:    s_sub_i32 s13, s6, s13
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX10-NEXT:    s_mov_b32 s9, 0x7fff7fff
+; GFX10-NEXT:    s_cselect_b32 s7, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s6, 0
+; GFX10-NEXT:    s_mov_b32 s11, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s8, s6, 0
+; GFX10-NEXT:    s_sext_i32_i16 s13, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX10-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX10-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX10-NEXT:    s_sub_i32 s7, s9, s7
+; GFX10-NEXT:    s_sub_i32 s10, s8, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s13
-; GFX10-NEXT:    s_sext_i32_i16 s13, s5
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s13, s14
-; GFX10-NEXT:    s_cselect_b32 s13, s13, s14
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s2
-; GFX10-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s8, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s13, s2
-; GFX10-NEXT:    s_sext_i32_i16 s11, s5
-; GFX10-NEXT:    s_sext_i32_i16 s8, s2
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s6, 0
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s11
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s8, s2
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX10-NEXT:    s_mov_b32 s6, 0x8000
+; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s11, s4
+; GFX10-NEXT:    s_sub_i32 s12, s6, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s12, s13
+; GFX10-NEXT:    s_cselect_b32 s12, s12, s13
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s2
+; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX10-NEXT:    s_sext_i32_i16 s10, s4
+; GFX10-NEXT:    s_sext_i32_i16 s7, s2
+; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s7, s10
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX10-NEXT:    s_sext_i32_i16 s4, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s2
-; GFX10-NEXT:    s_sext_i32_i16 s2, s1
-; GFX10-NEXT:    s_add_i32 s5, s5, s8
-; GFX10-NEXT:    s_ashr_i32 s8, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s2, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s5
-; GFX10-NEXT:    s_cselect_b32 s11, s2, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s4
-; GFX10-NEXT:    s_cselect_b32 s13, s8, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
-; GFX10-NEXT:    s_lshr_b32 s13, s11, 16
-; GFX10-NEXT:    s_sub_i32 s9, s9, s11
-; GFX10-NEXT:    s_sub_i32 s10, s10, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_add_i32 s7, s7, s10
+; GFX10-NEXT:    s_ashr_i32 s2, s1, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
+; GFX10-NEXT:    s_cselect_b32 s10, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 0
+; GFX10-NEXT:    s_cselect_b32 s12, s2, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
+; GFX10-NEXT:    s_lshr_b32 s12, s10, 16
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_sub_i32 s8, s8, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s2, 0
+; GFX10-NEXT:    s_sext_i32_i16 s5, s3
+; GFX10-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    s_sub_i32 s2, s12, s2
+; GFX10-NEXT:    s_sub_i32 s2, s11, s2
 ; GFX10-NEXT:    s_sub_i32 s4, s6, s4
-; GFX10-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX10-NEXT:    s_cmp_gt_i32 s2, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s9, s8
 ; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX10-NEXT:    s_sext_i32_i16 s3, s6
+; GFX10-NEXT:    s_sext_i32_i16 s3, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
-; GFX10-NEXT:    s_ashr_i32 s4, s6, 16
-; GFX10-NEXT:    s_sext_i32_i16 s6, s2
+; GFX10-NEXT:    s_ashr_i32 s4, s5, 16
+; GFX10-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s3
-; GFX10-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX10-NEXT:    s_cmp_lt_i32 s5, s3
+; GFX10-NEXT:    s_cselect_b32 s3, s5, s3
 ; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
 ; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
@@ -5676,29 +5612,26 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v6i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v7, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v7, s5, v7
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v6, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v6, s4, v6
-; GFX9-NEXT:    v_pk_max_i16 v3, v7, v3
+; GFX9-NEXT:    v_pk_min_i16 v8, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v8, v9, v8
+; GFX9-NEXT:    v_pk_max_i16 v6, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v6, v7, v6
+; GFX9-NEXT:    v_pk_max_i16 v3, v8, v3
 ; GFX9-NEXT:    v_pk_min_i16 v3, v3, v6
-; GFX9-NEXT:    v_pk_min_i16 v6, v1, s6
+; GFX9-NEXT:    v_pk_min_i16 v6, v1, 0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v6, s5, v6
-; GFX9-NEXT:    v_pk_sub_i16 v3, s4, v3
+; GFX9-NEXT:    v_pk_max_i16 v3, v1, 0
+; GFX9-NEXT:    v_pk_sub_i16 v6, v9, v6
+; GFX9-NEXT:    v_pk_sub_i16 v3, v7, v3
 ; GFX9-NEXT:    v_pk_max_i16 v4, v6, v4
 ; GFX9-NEXT:    v_pk_min_i16 v3, v4, v3
-; GFX9-NEXT:    v_pk_min_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s5, v4
+; GFX9-NEXT:    v_pk_min_i16 v4, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, s4, v3
+; GFX9-NEXT:    v_pk_max_i16 v3, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v3, v7, v3
 ; GFX9-NEXT:    v_pk_max_i16 v4, v4, v5
 ; GFX9-NEXT:    v_pk_min_i16 v3, v4, v3
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v3
@@ -5708,28 +5641,23 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 0, 0
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v7, v0, s5
-; GFX10-NEXT:    v_pk_min_i16 v8, v1, s5
-; GFX10-NEXT:    v_pk_min_i16 v9, v2, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
-; GFX10-NEXT:    v_pk_max_i16 v6, v0, s5
-; GFX10-NEXT:    v_pk_sub_i16 v14, s6, v7
-; GFX10-NEXT:    v_pk_sub_i16 v15, s6, v8
-; GFX10-NEXT:    v_pk_sub_i16 v19, s6, v9
-; GFX10-NEXT:    v_pk_max_i16 v10, v1, s5
-; GFX10-NEXT:    v_pk_max_i16 v11, v2, s5
-; GFX10-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX10-NEXT:    v_pk_min_i16 v7, v0, 0
+; GFX10-NEXT:    v_pk_min_i16 v8, v1, 0
+; GFX10-NEXT:    v_pk_min_i16 v9, v2, 0
+; GFX10-NEXT:    v_pk_max_i16 v6, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v10, v1, 0
+; GFX10-NEXT:    v_pk_sub_i16 v14, 0x80008000, v7
+; GFX10-NEXT:    v_pk_sub_i16 v15, 0x80008000, v8
+; GFX10-NEXT:    v_pk_max_i16 v11, v2, 0
+; GFX10-NEXT:    v_pk_sub_i16 v19, 0x80008000, v9
+; GFX10-NEXT:    v_pk_sub_i16 v6, 0x7fff7fff, v6
 ; GFX10-NEXT:    v_pk_max_i16 v3, v14, v3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX10-NEXT:    v_pk_sub_i16 v7, 0x7fff7fff, v10
 ; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT:    v_pk_sub_i16 v6, s4, v6
-; GFX10-NEXT:    v_pk_sub_i16 v7, s4, v10
-; GFX10-NEXT:    v_pk_sub_i16 v8, s4, v11
+; GFX10-NEXT:    v_pk_sub_i16 v8, 0x7fff7fff, v11
 ; GFX10-NEXT:    v_pk_max_i16 v5, v19, v5
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_pk_min_i16 v3, v3, v6
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_pk_min_i16 v4, v4, v7
 ; GFX10-NEXT:    v_pk_min_i16 v5, v5, v8
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v3
@@ -5968,119 +5896,115 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s11, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s8, s0
+; GFX9-NEXT:    s_ashr_i32 s9, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s10, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s10
+; GFX9-NEXT:    s_cselect_b32 s11, s8, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s9, 0
+; GFX9-NEXT:    s_cselect_b32 s12, s9, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX9-NEXT:    s_mov_b32 s6, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s13, s11, 16
+; GFX9-NEXT:    s_movk_i32 s12, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s11, s6, s11
+; GFX9-NEXT:    s_sub_i32 s13, s12, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s8, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s9, 0
+; GFX9-NEXT:    s_cselect_b32 s9, s9, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-NEXT:    s_mov_b32 s7, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s13, s8, 16
+; GFX9-NEXT:    s_mov_b32 s9, 0x8000
+; GFX9-NEXT:    s_sub_i32 s8, s7, s8
+; GFX9-NEXT:    s_sub_i32 s13, s9, s13
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s13
+; GFX9-NEXT:    s_sext_i32_i16 s13, s8
+; GFX9-NEXT:    s_sext_i32_i16 s14, s3
 ; GFX9-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s11
-; GFX9-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s13, s10, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s13
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
-; GFX9-NEXT:    s_lshr_b32 s14, s12, 16
-; GFX9-NEXT:    s_lshr_b32 s13, s6, 16
-; GFX9-NEXT:    s_sub_i32 s12, s6, s12
-; GFX9-NEXT:    s_sub_i32 s14, s13, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX9-NEXT:    s_movk_i32 s7, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s10, s10, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s7
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT:    s_lshr_b32 s14, s9, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX9-NEXT:    s_sub_i32 s9, s7, s9
-; GFX9-NEXT:    s_sub_i32 s14, s10, s14
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s14
-; GFX9-NEXT:    s_sext_i32_i16 s14, s9
-; GFX9-NEXT:    s_sext_i32_i16 s15, s3
-; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s14, s15
-; GFX9-NEXT:    s_cselect_b32 s14, s14, s15
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s3
-; GFX9-NEXT:    s_cselect_b32 s3, s9, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s14, s3
-; GFX9-NEXT:    s_sext_i32_i16 s9, s3
-; GFX9-NEXT:    s_sext_i32_i16 s14, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s13, s14
+; GFX9-NEXT:    s_cselect_b32 s13, s13, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s13, s3
+; GFX9-NEXT:    s_sext_i32_i16 s8, s3
+; GFX9-NEXT:    s_sext_i32_i16 s13, s11
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s12, s12, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s9, s14
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s12
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s3, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s3
-; GFX9-NEXT:    s_add_i32 s9, s9, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX9-NEXT:    s_sext_i32_i16 s3, s1
-; GFX9-NEXT:    s_ashr_i32 s9, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s12, s3, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX9-NEXT:    s_cselect_b32 s14, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT:    s_lshr_b32 s14, s12, 16
-; GFX9-NEXT:    s_sub_i32 s12, s6, s12
-; GFX9-NEXT:    s_sub_i32 s14, s13, s14
+; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s8, s13
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s13
 ; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s9, s8
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, s3
+; GFX9-NEXT:    s_add_i32 s8, s8, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX9-NEXT:    s_sext_i32_i16 s3, s1
+; GFX9-NEXT:    s_ashr_i32 s8, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s11, s3, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s13, s8, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s11, 16
+; GFX9-NEXT:    s_sub_i32 s11, s6, s11
+; GFX9-NEXT:    s_sub_i32 s13, s12, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s8, s8, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s3, s7, s3
-; GFX9-NEXT:    s_sub_i32 s9, s10, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT:    s_sext_i32_i16 s9, s3
-; GFX9-NEXT:    s_sext_i32_i16 s14, s4
+; GFX9-NEXT:    s_sub_i32 s8, s9, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-NEXT:    s_sext_i32_i16 s8, s3
+; GFX9-NEXT:    s_sext_i32_i16 s13, s4
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s14
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s13
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s13
 ; GFX9-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
-; GFX9-NEXT:    s_sext_i32_i16 s9, s12
+; GFX9-NEXT:    s_sext_i32_i16 s8, s11
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s12, s12, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s12
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s12
+; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s8
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
-; GFX9-NEXT:    s_add_i32 s4, s4, s9
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX9-NEXT:    s_ashr_i32 s4, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s9, s3, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s12, s4, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX9-NEXT:    s_sub_i32 s6, s6, s9
-; GFX9-NEXT:    s_sub_i32 s9, s13, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s3, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s11, s4, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s11
+; GFX9-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    s_sub_i32 s8, s12, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s3, s7, s3
-; GFX9-NEXT:    s_sub_i32 s4, s10, s4
+; GFX9-NEXT:    s_sub_i32 s4, s9, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s7, s5
@@ -6091,7 +6015,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX9-NEXT:    s_cmp_gt_i32 s3, s5
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
@@ -6110,141 +6034,137 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX10-LABEL: s_saddsat_v6i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s7, s0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s6
+; GFX10-NEXT:    s_sext_i32_i16 s6, s0
+; GFX10-NEXT:    s_sext_i32_i16 s7, 0
 ; GFX10-NEXT:    s_ashr_i32 s8, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s9
-; GFX10-NEXT:    s_movk_i32 s11, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s11
-; GFX10-NEXT:    s_cselect_b32 s12, s8, s6
-; GFX10-NEXT:    s_movk_i32 s14, 0x8000
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s11, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s10, 16
-; GFX10-NEXT:    s_sub_i32 s10, s11, s10
-; GFX10-NEXT:    s_sub_i32 s13, s12, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s14
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX10-NEXT:    s_sext_i32_i16 s16, s3
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s6
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s14, 16
-; GFX10-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX10-NEXT:    s_sub_i32 s7, s14, s7
-; GFX10-NEXT:    s_sub_i32 s15, s8, s15
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
+; GFX10-NEXT:    s_mov_b32 s11, 0x7fff7fff
+; GFX10-NEXT:    s_cselect_b32 s9, s6, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s8, 0
+; GFX10-NEXT:    s_mov_b32 s13, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s10, s8, 0
+; GFX10-NEXT:    s_sext_i32_i16 s15, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX10-NEXT:    s_movk_i32 s10, 0x7fff
+; GFX10-NEXT:    s_lshr_b32 s12, s9, 16
+; GFX10-NEXT:    s_sub_i32 s9, s11, s9
+; GFX10-NEXT:    s_sub_i32 s12, s10, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s15
-; GFX10-NEXT:    s_sext_i32_i16 s15, s7
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s16
-; GFX10-NEXT:    s_cselect_b32 s15, s15, s16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s3
-; GFX10-NEXT:    s_sext_i32_i16 s16, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s7, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s10, s13
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s15, s3
-; GFX10-NEXT:    s_sext_i32_i16 s13, s7
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, s8, 0
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s13
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s7
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, 0x8000
+; GFX10-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX10-NEXT:    s_sub_i32 s6, s13, s6
+; GFX10-NEXT:    s_sub_i32 s14, s8, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s14
+; GFX10-NEXT:    s_sext_i32_i16 s14, s6
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s15
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s15
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s3
+; GFX10-NEXT:    s_sext_i32_i16 s15, s4
+; GFX10-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s14, s3
+; GFX10-NEXT:    s_sext_i32_i16 s12, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s3
+; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s9, s12
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s6
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10-NEXT:    s_sext_i32_i16 s3, s1
-; GFX10-NEXT:    s_add_i32 s7, s7, s10
-; GFX10-NEXT:    s_ashr_i32 s10, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX10-NEXT:    s_cselect_b32 s13, s3, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s10, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s13, 16
-; GFX10-NEXT:    s_sub_i32 s13, s11, s13
-; GFX10-NEXT:    s_sub_i32 s15, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s6
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s6
+; GFX10-NEXT:    s_add_i32 s6, s6, s9
+; GFX10-NEXT:    s_ashr_i32 s9, s1, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s3, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT:    s_cselect_b32 s12, s3, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s14, s9, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s12, 16
+; GFX10-NEXT:    s_sub_i32 s12, s11, s12
+; GFX10-NEXT:    s_sub_i32 s14, s10, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, s9, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX10-NEXT:    s_sub_i32 s3, s14, s3
-; GFX10-NEXT:    s_sub_i32 s10, s8, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX10-NEXT:    s_sub_i32 s3, s13, s3
+; GFX10-NEXT:    s_sub_i32 s9, s8, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
+; GFX10-NEXT:    s_sext_i32_i16 s9, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s16
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s9, s15
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s15
 ; GFX10-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s13, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_sext_i32_i16 s13, s4
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_sext_i32_i16 s4, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX10-NEXT:    s_ashr_i32 s9, s12, 16
+; GFX10-NEXT:    s_sext_i32_i16 s12, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s13
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_lshr_b32 s10, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s3, 16
-; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_add_i32 s10, s10, s13
-; GFX10-NEXT:    s_ashr_i32 s3, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s10
-; GFX10-NEXT:    s_cselect_b32 s13, s4, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s3, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s13, 16
-; GFX10-NEXT:    s_sub_i32 s11, s11, s13
-; GFX10-NEXT:    s_sub_i32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s6
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
-; GFX10-NEXT:    s_sext_i32_i16 s6, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s12, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s12, s4
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s9
+; GFX10-NEXT:    s_sext_i32_i16 s12, s2
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX10-NEXT:    s_sub_i32 s3, s14, s3
-; GFX10-NEXT:    s_sub_i32 s4, s8, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX10-NEXT:    s_ashr_i32 s4, s5, 16
+; GFX10-NEXT:    s_ashr_i32 s4, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_add_i32 s9, s9, s14
+; GFX10-NEXT:    s_cmp_gt_i32 s12, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s9
+; GFX10-NEXT:    s_cselect_b32 s3, s12, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s14, s4, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX10-NEXT:    s_sub_i32 s3, s11, s3
+; GFX10-NEXT:    s_sub_i32 s10, s10, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s12, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
+; GFX10-NEXT:    s_cselect_b32 s7, s12, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s13, s4
+; GFX10-NEXT:    s_sub_i32 s7, s8, s7
+; GFX10-NEXT:    s_sext_i32_i16 s8, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s7, s8
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s8
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX10-NEXT:    s_sext_i32_i16 s5, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s11, s12
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
-; GFX10-NEXT:    s_ashr_i32 s5, s6, 16
-; GFX10-NEXT:    s_sext_i32_i16 s6, s3
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s7, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s3
+; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
 ; GFX10-NEXT:    s_add_i32 s4, s4, s5
@@ -6438,36 +6358,33 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v8i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v9, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v9, s5, v9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v8, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v8, s4, v8
-; GFX9-NEXT:    v_pk_max_i16 v4, v9, v4
+; GFX9-NEXT:    v_pk_min_i16 v10, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v10, v11, v10
+; GFX9-NEXT:    v_pk_max_i16 v8, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v8, v9, v8
+; GFX9-NEXT:    v_pk_max_i16 v4, v10, v4
 ; GFX9-NEXT:    v_pk_min_i16 v4, v4, v8
-; GFX9-NEXT:    v_pk_min_i16 v8, v1, s6
+; GFX9-NEXT:    v_pk_min_i16 v8, v1, 0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v8, s5, v8
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v1, 0
+; GFX9-NEXT:    v_pk_sub_i16 v8, v11, v8
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v8, v5
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT:    v_pk_min_i16 v5, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, s5, v5
+; GFX9-NEXT:    v_pk_min_i16 v5, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v5, v11, v5
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v5, v6
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT:    v_pk_min_i16 v5, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, s5, v5
+; GFX9-NEXT:    v_pk_min_i16 v5, v3, 0
+; GFX9-NEXT:    v_pk_sub_i16 v5, v11, v5
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v3, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v5, v7
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v4
@@ -6477,30 +6394,25 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v8, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_min_i16 v11, v1, s4
-; GFX10-NEXT:    v_pk_min_i16 v12, v3, s4
-; GFX10-NEXT:    v_pk_max_i16 v9, v0, s4
-; GFX10-NEXT:    v_pk_sub_i16 v15, s5, v8
-; GFX10-NEXT:    v_pk_min_i16 v8, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v11, s5, v11
-; GFX10-NEXT:    v_pk_sub_i16 v12, s5, v12
-; GFX10-NEXT:    v_pk_max_i16 v10, v1, s4
-; GFX10-NEXT:    v_pk_max_i16 v13, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v8, s5, v8
-; GFX10-NEXT:    v_pk_max_i16 v14, v3, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_min_i16 v8, v0, 0
+; GFX10-NEXT:    v_pk_min_i16 v11, v1, 0
+; GFX10-NEXT:    v_pk_min_i16 v12, v3, 0
+; GFX10-NEXT:    v_pk_max_i16 v9, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v10, v1, 0
+; GFX10-NEXT:    v_pk_sub_i16 v15, 0x80008000, v8
+; GFX10-NEXT:    v_pk_min_i16 v8, v2, 0
+; GFX10-NEXT:    v_pk_sub_i16 v11, 0x80008000, v11
+; GFX10-NEXT:    v_pk_sub_i16 v12, 0x80008000, v12
+; GFX10-NEXT:    v_pk_max_i16 v13, v2, 0
+; GFX10-NEXT:    v_pk_max_i16 v14, v3, 0
+; GFX10-NEXT:    v_pk_sub_i16 v8, 0x80008000, v8
 ; GFX10-NEXT:    v_pk_max_i16 v5, v11, v5
-; GFX10-NEXT:    v_pk_sub_i16 v9, s6, v9
-; GFX10-NEXT:    v_pk_sub_i16 v10, s6, v10
+; GFX10-NEXT:    v_pk_sub_i16 v10, 0x7fff7fff, v10
+; GFX10-NEXT:    v_pk_sub_i16 v9, 0x7fff7fff, v9
+; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
 ; GFX10-NEXT:    v_pk_max_i16 v6, v8, v6
-; GFX10-NEXT:    v_pk_sub_i16 v11, s6, v13
-; GFX10-NEXT:    v_pk_sub_i16 v8, s6, v14
+; GFX10-NEXT:    v_pk_sub_i16 v11, 0x7fff7fff, v13
+; GFX10-NEXT:    v_pk_sub_i16 v8, 0x7fff7fff, v14
 ; GFX10-NEXT:    v_pk_max_i16 v7, v12, v7
 ; GFX10-NEXT:    v_pk_min_i16 v15, v4, v9
 ; GFX10-NEXT:    v_pk_min_i16 v19, v5, v10
@@ -6814,138 +6726,134 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s13, s10
-; GFX9-NEXT:    s_sext_i32_i16 s11, s0
-; GFX9-NEXT:    s_ashr_i32 s12, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s10, s0
+; GFX9-NEXT:    s_ashr_i32 s11, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s12, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s13, s10, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s11, 0
+; GFX9-NEXT:    s_cselect_b32 s14, s11, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s14
+; GFX9-NEXT:    s_mov_b32 s8, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s15, s13, 16
+; GFX9-NEXT:    s_movk_i32 s14, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s13, s8, s13
+; GFX9-NEXT:    s_sub_i32 s15, s14, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s11, 0
+; GFX9-NEXT:    s_cselect_b32 s11, s11, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_mov_b32 s9, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s15, s10, 16
+; GFX9-NEXT:    s_mov_b32 s11, 0x8000
+; GFX9-NEXT:    s_sub_i32 s10, s9, s10
+; GFX9-NEXT:    s_sub_i32 s15, s11, s15
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s15
+; GFX9-NEXT:    s_sext_i32_i16 s15, s10
+; GFX9-NEXT:    s_sext_i32_i16 s16, s4
 ; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s13
-; GFX9-NEXT:    s_cselect_b32 s14, s11, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s15, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s8
-; GFX9-NEXT:    s_lshr_b32 s16, s14, 16
-; GFX9-NEXT:    s_lshr_b32 s15, s8, 16
-; GFX9-NEXT:    s_sub_i32 s14, s8, s14
-; GFX9-NEXT:    s_sub_i32 s16, s15, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s13
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s9, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_lshr_b32 s16, s11, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX9-NEXT:    s_sub_i32 s11, s9, s11
-; GFX9-NEXT:    s_sub_i32 s16, s12, s16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s16
-; GFX9-NEXT:    s_sext_i32_i16 s16, s11
-; GFX9-NEXT:    s_sext_i32_i16 s17, s4
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s16, s17
-; GFX9-NEXT:    s_cselect_b32 s16, s16, s17
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
-; GFX9-NEXT:    s_sext_i32_i16 s11, s4
-; GFX9-NEXT:    s_sext_i32_i16 s16, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s15, s16
+; GFX9-NEXT:    s_cselect_b32 s15, s15, s16
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s15, s4
+; GFX9-NEXT:    s_sext_i32_i16 s10, s4
+; GFX9-NEXT:    s_sext_i32_i16 s15, s13
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s14, s14, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s16
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s14
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s14
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT:    s_lshr_b32 s11, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s14, s4, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s4
-; GFX9-NEXT:    s_add_i32 s11, s11, s14
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
-; GFX9-NEXT:    s_sext_i32_i16 s4, s1
-; GFX9-NEXT:    s_ashr_i32 s11, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s14, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s16, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_lshr_b32 s16, s14, 16
-; GFX9-NEXT:    s_sub_i32 s14, s8, s14
-; GFX9-NEXT:    s_sub_i32 s16, s15, s16
+; GFX9-NEXT:    s_ashr_i32 s13, s13, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s15
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
 ; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
+; GFX9-NEXT:    s_lshr_b32 s10, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, s4
+; GFX9-NEXT:    s_add_i32 s10, s10, s13
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s10
+; GFX9-NEXT:    s_sext_i32_i16 s4, s1
+; GFX9-NEXT:    s_ashr_i32 s10, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s13, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s15, s10, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_lshr_b32 s15, s13, 16
+; GFX9-NEXT:    s_sub_i32 s13, s8, s13
+; GFX9-NEXT:    s_sub_i32 s15, s14, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s11, s12, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_sext_i32_i16 s11, s4
-; GFX9-NEXT:    s_sext_i32_i16 s16, s5
+; GFX9-NEXT:    s_sub_i32 s10, s11, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_sext_i32_i16 s10, s4
+; GFX9-NEXT:    s_sext_i32_i16 s15, s5
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s16
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s15
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
 ; GFX9-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s11, s14
+; GFX9-NEXT:    s_sext_i32_i16 s10, s13
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s14, s14, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s11
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s14
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s14
+; GFX9-NEXT:    s_ashr_i32 s13, s13, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
+; GFX9-NEXT:    s_add_i32 s5, s5, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX9-NEXT:    s_ashr_i32 s5, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s11, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s14, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s14
-; GFX9-NEXT:    s_lshr_b32 s14, s11, 16
-; GFX9-NEXT:    s_sub_i32 s11, s8, s11
-; GFX9-NEXT:    s_sub_i32 s14, s15, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s13, s5, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s10, 16
+; GFX9-NEXT:    s_sub_i32 s10, s8, s10
+; GFX9-NEXT:    s_sub_i32 s13, s14, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s5, s12, s5
+; GFX9-NEXT:    s_sub_i32 s5, s11, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s14
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s13
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s14, s6
+; GFX9-NEXT:    s_sext_i32_i16 s13, s6
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s14
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s5, s13
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s13
 ; GFX9-NEXT:    s_cmp_gt_i32 s4, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s6, s11
+; GFX9-NEXT:    s_sext_i32_i16 s6, s10
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
 ; GFX9-NEXT:    s_cmp_lt_i32 s5, s6
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s10
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
@@ -6954,22 +6862,22 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_ashr_i32 s5, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s6, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s6, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s6, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s10, s5, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s6, 16
 ; GFX9-NEXT:    s_sub_i32 s6, s8, s6
-; GFX9-NEXT:    s_sub_i32 s8, s15, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_sub_i32 s8, s14, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s5, s12, s5
+; GFX9-NEXT:    s_sub_i32 s5, s11, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
@@ -6999,188 +6907,184 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX10-LABEL: s_saddsat_v8i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s0
-; GFX10-NEXT:    s_sext_i32_i16 s11, s8
+; GFX10-NEXT:    s_sext_i32_i16 s8, s0
+; GFX10-NEXT:    s_sext_i32_i16 s9, 0
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
-; GFX10-NEXT:    s_movk_i32 s13, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s13
-; GFX10-NEXT:    s_cselect_b32 s14, s10, s8
-; GFX10-NEXT:    s_movk_i32 s16, 0x8000
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX10-NEXT:    s_lshr_b32 s14, s13, 16
-; GFX10-NEXT:    s_lshr_b32 s15, s12, 16
-; GFX10-NEXT:    s_sub_i32 s12, s13, s12
-; GFX10-NEXT:    s_sub_i32 s15, s14, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s16, s16, s16
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX10-NEXT:    s_sext_i32_i16 s18, s4
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s8
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s16, 16
-; GFX10-NEXT:    s_lshr_b32 s17, s9, 16
-; GFX10-NEXT:    s_sub_i32 s9, s16, s9
-; GFX10-NEXT:    s_sub_i32 s17, s10, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
+; GFX10-NEXT:    s_mov_b32 s13, 0x7fff7fff
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, 0
+; GFX10-NEXT:    s_mov_b32 s15, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s12, s10, 0
+; GFX10-NEXT:    s_sext_i32_i16 s17, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX10-NEXT:    s_movk_i32 s12, 0x7fff
+; GFX10-NEXT:    s_lshr_b32 s14, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s13, s11
+; GFX10-NEXT:    s_sub_i32 s14, s12, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s17
-; GFX10-NEXT:    s_sext_i32_i16 s17, s9
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX10-NEXT:    s_cselect_b32 s17, s17, s18
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s4
-; GFX10-NEXT:    s_sext_i32_i16 s18, s5
-; GFX10-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s12, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s17, s4
-; GFX10-NEXT:    s_sext_i32_i16 s15, s9
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, s10, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s15
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
+; GFX10-NEXT:    s_mov_b32 s10, 0x8000
+; GFX10-NEXT:    s_lshr_b32 s16, s8, 16
+; GFX10-NEXT:    s_sub_i32 s8, s15, s8
+; GFX10-NEXT:    s_sub_i32 s16, s10, s16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s16
+; GFX10-NEXT:    s_sext_i32_i16 s16, s8
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s16, s17
+; GFX10-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s4
+; GFX10-NEXT:    s_sext_i32_i16 s17, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s11, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
+; GFX10-NEXT:    s_sext_i32_i16 s14, s8
+; GFX10-NEXT:    s_sext_i32_i16 s11, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s11, s14
+; GFX10-NEXT:    s_cselect_b32 s11, s11, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s8
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s4
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s1
-; GFX10-NEXT:    s_add_i32 s9, s9, s12
-; GFX10-NEXT:    s_ashr_i32 s12, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-NEXT:    s_cselect_b32 s15, s4, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s12, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s15, 16
-; GFX10-NEXT:    s_sub_i32 s15, s13, s15
-; GFX10-NEXT:    s_sub_i32 s17, s14, s17
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s8
+; GFX10-NEXT:    s_add_i32 s8, s8, s11
+; GFX10-NEXT:    s_ashr_i32 s11, s1, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX10-NEXT:    s_cselect_b32 s14, s4, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s11, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s14, 16
+; GFX10-NEXT:    s_sub_i32 s14, s13, s14
+; GFX10-NEXT:    s_sub_i32 s16, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, s11, 0
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_sub_i32 s4, s16, s4
-; GFX10-NEXT:    s_sub_i32 s12, s10, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s15, s4
+; GFX10-NEXT:    s_sub_i32 s11, s10, s11
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
+; GFX10-NEXT:    s_sext_i32_i16 s11, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s18
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s18
+; GFX10-NEXT:    s_cmp_gt_i32 s11, s17
+; GFX10-NEXT:    s_cselect_b32 s11, s11, s17
 ; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX10-NEXT:    s_sext_i32_i16 s18, s6
+; GFX10-NEXT:    s_sext_i32_i16 s17, s6
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s15, s17
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_sext_i32_i16 s15, s5
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_sext_i32_i16 s5, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX10-NEXT:    s_ashr_i32 s11, s14, 16
+; GFX10-NEXT:    s_sext_i32_i16 s14, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s15
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_add_i32 s1, s1, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_add_i32 s5, s5, s12
-; GFX10-NEXT:    s_ashr_i32 s12, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s15, s4, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s12, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s15, 16
-; GFX10-NEXT:    s_sub_i32 s15, s13, s15
-; GFX10-NEXT:    s_sub_i32 s17, s14, s17
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s14, s5
 ; GFX10-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
+; GFX10-NEXT:    s_sext_i32_i16 s14, s2
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s8
+; GFX10-NEXT:    s_lshr_b32 s11, s1, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
+; GFX10-NEXT:    s_ashr_i32 s5, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
+; GFX10-NEXT:    s_add_i32 s1, s1, s4
+; GFX10-NEXT:    s_add_i32 s11, s11, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s11
+; GFX10-NEXT:    s_cselect_b32 s4, s14, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s5, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s13, s4
+; GFX10-NEXT:    s_sub_i32 s16, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_sub_i32 s4, s16, s4
-; GFX10-NEXT:    s_sub_i32 s12, s10, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s18
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s18
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX10-NEXT:    s_sext_i32_i16 s6, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_ashr_i32 s12, s15, 16
-; GFX10-NEXT:    s_sext_i32_i16 s15, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s14, s5
+; GFX10-NEXT:    s_lshr_b32 s14, s5, 16
+; GFX10-NEXT:    s_sub_i32 s5, s15, s5
+; GFX10-NEXT:    s_sub_i32 s14, s10, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s14
+; GFX10-NEXT:    s_sext_i32_i16 s14, s5
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s17
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX10-NEXT:    s_sext_i32_i16 s6, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s14, s5
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s15, s6
-; GFX10-NEXT:    s_cselect_b32 s6, s15, s6
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s15, s3
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX10-NEXT:    s_sext_i32_i16 s14, s5
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s6
+; GFX10-NEXT:    s_cselect_b32 s6, s14, s6
+; GFX10-NEXT:    s_cmp_lt_i32 s5, s4
+; GFX10-NEXT:    s_sext_i32_i16 s14, s3
+; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s4
 ; GFX10-NEXT:    s_ashr_i32 s6, s3, 16
-; GFX10-NEXT:    s_lshr_b32 s17, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
 ; GFX10-NEXT:    s_add_i32 s2, s2, s4
-; GFX10-NEXT:    s_add_i32 s12, s12, s17
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s12
-; GFX10-NEXT:    s_cselect_b32 s4, s15, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s6, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s4, 16
+; GFX10-NEXT:    s_add_i32 s5, s5, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s14, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s6, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
 ; GFX10-NEXT:    s_sub_i32 s4, s13, s4
-; GFX10-NEXT:    s_sub_i32 s13, s14, s17
-; GFX10-NEXT:    s_cmp_lt_i32 s15, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s13
-; GFX10-NEXT:    s_cselect_b32 s11, s15, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s11, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_sub_i32 s6, s16, s6
-; GFX10-NEXT:    s_sub_i32 s8, s10, s8
+; GFX10-NEXT:    s_sub_i32 s12, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX10-NEXT:    s_cselect_b32 s9, s14, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s6, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s6
+; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX10-NEXT:    s_sub_i32 s6, s15, s6
+; GFX10-NEXT:    s_sub_i32 s9, s10, s9
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
 ; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_sext_i32_i16 s8, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s10
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s10
+; GFX10-NEXT:    s_cmp_gt_i32 s9, s10
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s10
 ; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
 ; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_sext_i32_i16 s8, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s7
-; GFX10-NEXT:    s_cselect_b32 s7, s8, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s9, s7
+; GFX10-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
 ; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX10-NEXT:    s_add_i32 s3, s3, s4
-; GFX10-NEXT:    s_add_i32 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x i32>

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index ed1fe7af5f36..015f6b5de8b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -529,8 +529,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_shl_v2i16_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s4, v0
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 3e1778bcb881..ac2a75383cba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4512,15 +4512,12 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v2, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v3, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s5
+; GFX9-NEXT:    v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
+; GFX9-NEXT:    v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v4
 ; GFX9-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_min_i16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4530,16 +4527,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v2, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_min_i16 v3, v0, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v2, v2, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s6
-; GFX10-NEXT:    v_pk_sub_i16 v3, v3, s4
+; GFX10-NEXT:    v_pk_sub_i16 v2, v2, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v3, v3, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4635,53 +4627,45 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX9-LABEL: s_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    s_sext_i32_i16 s7, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX9-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s9, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s8, 16
-; GFX9-NEXT:    s_sub_i32 s2, s8, s2
-; GFX9-NEXT:    s_sub_i32 s8, s9, s10
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX9-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s3, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s3
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX9-NEXT:    s_sub_i32 s3, s4, s3
-; GFX9-NEXT:    s_sub_i32 s4, s5, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s2, s0
+; GFX9-NEXT:    s_ashr_i32 s3, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s4, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s2, s4
+; GFX9-NEXT:    s_cmp_gt_i32 s3, -1
+; GFX9-NEXT:    s_cselect_b32 s6, s3, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX9-NEXT:    s_sub_i32 s5, s5, 0x7fff7fff
+; GFX9-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s3, -1
+; GFX9-NEXT:    s_cselect_b32 s3, s3, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, 0x80008000
+; GFX9-NEXT:    s_sub_i32 s3, s3, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_sext_i32_i16 s3, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 16
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s1
-; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_sext_i32_i16 s4, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s1
+; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
+; GFX9-NEXT:    s_sext_i32_i16 s3, s1
+; GFX9-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
-; GFX9-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s2
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s1
@@ -4691,47 +4675,39 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX10-LABEL: s_ssubsat_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s3, s0
-; GFX10-NEXT:    s_sext_i32_i16 s5, s2
+; GFX10-NEXT:    s_sext_i32_i16 s2, s0
+; GFX10-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX10-NEXT:    s_ashr_i32 s4, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX10-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s5
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s2, s3
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s7, s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s8, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX10-NEXT:    s_sub_i32 s6, s6, s7
-; GFX10-NEXT:    s_sub_i32 s7, s8, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s2
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_sub_i32 s2, s2, s4
-; GFX10-NEXT:    s_sub_i32 s4, s5, s6
-; GFX10-NEXT:    s_sext_i32_i16 s5, s3
+; GFX10-NEXT:    s_cselect_b32 s5, s2, s3
+; GFX10-NEXT:    s_cmp_gt_i32 s4, -1
+; GFX10-NEXT:    s_cselect_b32 s6, s4, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s1
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX10-NEXT:    s_sub_i32 s5, s5, 0x7fff7fff
+; GFX10-NEXT:    s_sub_i32 s7, s7, 0x7fff
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s7
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX10-NEXT:    s_cmp_lt_i32 s4, -1
+; GFX10-NEXT:    s_sext_i32_i16 s3, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, -1
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_sub_i32 s2, s2, 0x80008000
+; GFX10-NEXT:    s_sub_i32 s4, s4, 0x8000
+; GFX10-NEXT:    s_cmp_gt_i32 s3, s6
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s1
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX10-NEXT:    s_cmp_gt_i32 s5, s1
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_cselect_b32 s1, s5, s1
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s5, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX10-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX10-NEXT:    s_cmp_lt_i32 s3, s4
@@ -4819,72 +4795,56 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, -1, -1
-; GFX9-NEXT:    s_sext_i32_i16 s6, s3
-; GFX9-NEXT:    s_sext_i32_i16 s4, s0
-; GFX9-NEXT:    s_ashr_i32 s5, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s7, s4, s6
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX9-NEXT:    s_sub_i32 s1, s7, s1
-; GFX9-NEXT:    s_sub_i32 s7, s8, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_sub_i32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_pk_max_i16 v0, s1, v0
-; GFX9-NEXT:    v_pk_min_i16 v0, v0, s2
+; GFX9-NEXT:    s_sext_i32_i16 s1, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s3, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s1, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s2, -1
+; GFX9-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_sub_i32 s4, s4, 0x7fff7fff
+; GFX9-NEXT:    s_sub_i32 s5, s5, 0x7fff
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX9-NEXT:    s_cmp_lt_i32 s2, -1
+; GFX9-NEXT:    s_cselect_b32 s2, s2, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s1, s1, 0x80008000
+; GFX9-NEXT:    s_sub_i32 s2, s2, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    v_pk_max_i16 v0, s4, v0
+; GFX9-NEXT:    v_pk_min_i16 v0, v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_v2i16_sv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s2, s0
-; GFX10-NEXT:    s_sext_i32_i16 s4, s1
+; GFX10-NEXT:    s_sext_i32_i16 s1, s0
+; GFX10-NEXT:    s_sext_i32_i16 s2, -1
 ; GFX10-NEXT:    s_ashr_i32 s3, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s2, s4
-; GFX10-NEXT:    s_movk_i32 s7, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s5, s2, s4
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s1
+; GFX10-NEXT:    s_cmp_gt_i32 s1, s2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s7, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_sub_i32 s5, s5, s6
-; GFX10-NEXT:    s_sub_i32 s6, s7, s8
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s1
-; GFX10-NEXT:    s_movk_i32 s4, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_cselect_b32 s4, s1, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s3, -1
+; GFX10-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s4, 0x7fff7fff
+; GFX10-NEXT:    s_sub_i32 s5, s5, 0x7fff
+; GFX10-NEXT:    s_cmp_lt_i32 s1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX10-NEXT:    s_cmp_lt_i32 s3, -1
+; GFX10-NEXT:    s_cselect_b32 s2, s3, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    v_pk_max_i16 v0, s2, v0
-; GFX10-NEXT:    s_sub_i32 s1, s1, s3
-; GFX10-NEXT:    s_sub_i32 s2, s4, s5
+; GFX10-NEXT:    s_sub_i32 s1, s1, 0x80008000
+; GFX10-NEXT:    s_sub_i32 s2, s3, 0x8000
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX10-NEXT:    v_pk_min_i16 v0, v0, s1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, v0
@@ -4951,15 +4911,12 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, -1, -1
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    v_pk_max_i16 v1, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    v_pk_min_i16 v2, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s2
+; GFX9-NEXT:    v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-NEXT:    v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_max_i16 v1, v1, s0
 ; GFX9-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4967,16 +4924,11 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX10-LABEL: ssubsat_v2i16_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX10-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v1, v0, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX10-NEXT:    v_pk_min_i16 v2, v0, s1
-; GFX10-NEXT:    s_movk_i32 s3, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s3
-; GFX10-NEXT:    v_pk_sub_i16 v2, v2, s1
+; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v2, v2, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v1, v1, s0
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -5098,22 +5050,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v4, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v5, v0, s6
+; GFX9-NEXT:    v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v5
+; GFX9-NEXT:    v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x80008000
 ; GFX9-NEXT:    v_pk_max_i16 v2, v4, v2
-; GFX9-NEXT:    v_pk_sub_i16 v5, v5, s5
-; GFX9-NEXT:    v_pk_min_i16 v2, v2, v5
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v7
+; GFX9-NEXT:    v_pk_min_i16 v2, v2, v6
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_i16 v2, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s4
-; GFX9-NEXT:    v_pk_min_i16 v4, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s5
+; GFX9-NEXT:    v_pk_max_i16 v2, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v5
+; GFX9-NEXT:    v_pk_min_i16 v4, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v7
 ; GFX9-NEXT:    v_pk_max_i16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_min_i16 v2, v2, v4
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
@@ -5123,24 +5072,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, -1, -1
-; GFX10-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v4, v0, s5
-; GFX10-NEXT:    v_pk_max_i16 v5, v1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX10-NEXT:    v_pk_min_i16 v6, v0, s5
-; GFX10-NEXT:    v_pk_min_i16 v7, v1, s5
-; GFX10-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX10-NEXT:    v_pk_sub_i16 v5, v5, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v5, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v7, v1, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_sub_i16 v4, v4, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v5, v5, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v6, v6, 0x80008000
+; GFX10-NEXT:    v_pk_sub_i16 v7, v7, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v11, v4, v2
-; GFX10-NEXT:    v_pk_sub_i16 v6, v6, s6
-; GFX10-NEXT:    v_pk_sub_i16 v4, v7, s6
-; GFX10-NEXT:    v_pk_max_i16 v3, v5, v3
+; GFX10-NEXT:    v_pk_max_i16 v10, v5, v3
 ; GFX10-NEXT:    v_pk_min_i16 v2, v11, v6
-; GFX10-NEXT:    v_pk_min_i16 v3, v3, v4
+; GFX10-NEXT:    v_pk_min_i16 v3, v10, v7
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -5306,77 +5250,73 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
+; GFX9-NEXT:    s_sext_i32_i16 s6, s0
+; GFX9-NEXT:    s_ashr_i32 s7, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s8, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s9, s6, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s7, -1
+; GFX9-NEXT:    s_cselect_b32 s10, s7, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s10, s9, 16
+; GFX9-NEXT:    s_movk_i32 s11, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s9, s9, s4
+; GFX9-NEXT:    s_sub_i32 s10, s10, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s7, -1
+; GFX9-NEXT:    s_cselect_b32 s7, s7, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX9-NEXT:    s_mov_b32 s5, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
+; GFX9-NEXT:    s_mov_b32 s10, 0x8000
+; GFX9-NEXT:    s_sub_i32 s6, s6, s5
+; GFX9-NEXT:    s_sub_i32 s7, s7, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX9-NEXT:    s_sext_i32_i16 s7, s9
+; GFX9-NEXT:    s_sext_i32_i16 s12, s2
+; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s7, s12
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s9, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s9, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
+; GFX9-NEXT:    s_sext_i32_i16 s7, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s9, s6
-; GFX9-NEXT:    s_sext_i32_i16 s7, s0
-; GFX9-NEXT:    s_ashr_i32 s8, s0, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s9
-; GFX9-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s11, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    s_lshr_b32 s11, s10, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX9-NEXT:    s_sub_i32 s10, s10, s4
-; GFX9-NEXT:    s_sub_i32 s11, s11, s12
 ; GFX9-NEXT:    s_cmp_lt_i32 s7, s9
 ; GFX9-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX9-NEXT:    s_sub_i32 s7, s7, s5
-; GFX9-NEXT:    s_sub_i32 s8, s8, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_sext_i32_i16 s8, s10
-; GFX9-NEXT:    s_sext_i32_i16 s13, s2
-; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s8, s13
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s2
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
-; GFX9-NEXT:    s_sext_i32_i16 s10, s7
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s7
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s2
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s2
-; GFX9-NEXT:    s_sub_i32 s2, s7, s8
+; GFX9-NEXT:    s_sub_i32 s2, s6, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_ashr_i32 s7, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s2, s9
-; GFX9-NEXT:    s_cselect_b32 s8, s2, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s10, s7, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_sub_i32 s4, s8, s4
-; GFX9-NEXT:    s_sub_i32 s8, s10, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s9
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s7, s2, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX9-NEXT:    s_cselect_b32 s9, s6, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s9
+; GFX9-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX9-NEXT:    s_sub_i32 s4, s7, s4
+; GFX9-NEXT:    s_sub_i32 s7, s9, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX9-NEXT:    s_cselect_b32 s6, s6, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s6, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s8
+; GFX9-NEXT:    s_sub_i32 s5, s6, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s6, s3
@@ -5405,80 +5345,76 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX10-LABEL: s_ssubsat_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s5, s0
-; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_sext_i32_i16 s4, s0
+; GFX10-NEXT:    s_sext_i32_i16 s5, -1
 ; GFX10-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX10-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX10-NEXT:    s_movk_i32 s12, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s9, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s10, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s9, 16
-; GFX10-NEXT:    s_sub_i32 s8, s8, s9
-; GFX10-NEXT:    s_sub_i32 s10, s10, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX10-NEXT:    s_sext_i32_i16 s14, s2
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
+; GFX10-NEXT:    s_cselect_b32 s7, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s11, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s8, s6, -1
+; GFX10-NEXT:    s_sext_i32_i16 s13, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX10-NEXT:    s_mov_b32 s8, 0x7fff7fff
+; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX10-NEXT:    s_sub_i32 s7, s7, s8
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s6, s6, -1
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s8, s10
-; GFX10-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX10-NEXT:    s_lshr_b32 s10, s12, 16
-; GFX10-NEXT:    s_sext_i32_i16 s13, s6
-; GFX10-NEXT:    s_sub_i32 s5, s5, s12
-; GFX10-NEXT:    s_sub_i32 s8, s8, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s7, s9
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-NEXT:    s_mov_b32 s9, 0x8000
+; GFX10-NEXT:    s_sext_i32_i16 s12, s6
+; GFX10-NEXT:    s_sub_i32 s4, s4, s11
+; GFX10-NEXT:    s_sub_i32 s7, s7, s9
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s13, s14
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s8
-; GFX10-NEXT:    s_cselect_b32 s13, s13, s14
+; GFX10-NEXT:    s_cmp_gt_i32 s12, s13
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
+; GFX10-NEXT:    s_cselect_b32 s12, s12, s13
 ; GFX10-NEXT:    s_cmp_gt_i32 s6, s2
-; GFX10-NEXT:    s_sext_i32_i16 s8, s5
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
 ; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s13, s2
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s6, s2
 ; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX10-NEXT:    s_sub_i32 s0, s0, s2
-; GFX10-NEXT:    s_sub_i32 s2, s5, s6
-; GFX10-NEXT:    s_sext_i32_i16 s5, s1
+; GFX10-NEXT:    s_sub_i32 s2, s4, s6
+; GFX10-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX10-NEXT:    s_ashr_i32 s6, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s13, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s13
-; GFX10-NEXT:    s_lshr_b32 s13, s8, 16
-; GFX10-NEXT:    s_sub_i32 s8, s8, s9
-; GFX10-NEXT:    s_sub_i32 s9, s13, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
+; GFX10-NEXT:    s_cselect_b32 s7, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s12, s6, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
+; GFX10-NEXT:    s_lshr_b32 s12, s7, 16
+; GFX10-NEXT:    s_sub_i32 s7, s7, s8
+; GFX10-NEXT:    s_sub_i32 s8, s12, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s5, s6, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s7, s8
 ; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s5
 ; GFX10-NEXT:    s_sext_i32_i16 s8, s3
-; GFX10-NEXT:    s_sub_i32 s4, s4, s12
-; GFX10-NEXT:    s_sub_i32 s6, s6, s10
+; GFX10-NEXT:    s_sub_i32 s4, s4, s11
+; GFX10-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX10-NEXT:    s_cmp_gt_i32 s7, s8
@@ -5661,29 +5597,26 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v6i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v6, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v6, v6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v7, v0, s6
+; GFX9-NEXT:    v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v7
+; GFX9-NEXT:    v_pk_min_i16 v8, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x80008000
 ; GFX9-NEXT:    v_pk_max_i16 v3, v6, v3
-; GFX9-NEXT:    v_pk_sub_i16 v7, v7, s5
-; GFX9-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_sub_i16 v8, v8, v9
+; GFX9-NEXT:    v_pk_min_i16 v3, v3, v8
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s4
-; GFX9-NEXT:    v_pk_min_i16 v6, v1, s6
+; GFX9-NEXT:    v_pk_max_i16 v3, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_min_i16 v6, v1, -1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_i16 v3, v3, v4
-; GFX9-NEXT:    v_pk_sub_i16 v6, v6, s5
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v9
 ; GFX9-NEXT:    v_pk_min_i16 v3, v3, v6
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s4
-; GFX9-NEXT:    v_pk_min_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s5
+; GFX9-NEXT:    v_pk_max_i16 v3, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_min_i16 v4, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v9
 ; GFX9-NEXT:    v_pk_max_i16 v3, v3, v5
 ; GFX9-NEXT:    v_pk_min_i16 v3, v3, v4
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
@@ -5693,28 +5626,23 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, -1, -1
-; GFX10-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v6, v0, s5
-; GFX10-NEXT:    v_pk_max_i16 v8, v1, s5
-; GFX10-NEXT:    v_pk_max_i16 v9, v2, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX10-NEXT:    v_pk_min_i16 v7, v0, s5
-; GFX10-NEXT:    v_pk_sub_i16 v6, v6, s4
-; GFX10-NEXT:    v_pk_sub_i16 v15, v8, s4
-; GFX10-NEXT:    v_pk_sub_i16 v19, v9, s4
-; GFX10-NEXT:    v_pk_min_i16 v10, v1, s5
-; GFX10-NEXT:    v_pk_min_i16 v11, v2, s5
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v8, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v9, v2, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v7, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v10, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_sub_i16 v6, v6, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v15, v8, 0x7fff7fff
+; GFX10-NEXT:    v_pk_min_i16 v11, v2, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_sub_i16 v19, v9, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v7, v7, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v14, v6, v3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_sub_i16 v6, v10, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT:    v_pk_sub_i16 v7, v7, s6
-; GFX10-NEXT:    v_pk_sub_i16 v6, v10, s6
-; GFX10-NEXT:    v_pk_sub_i16 v8, v11, s6
+; GFX10-NEXT:    v_pk_sub_i16 v8, v11, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v5, v19, v5
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_pk_min_i16 v3, v14, v7
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_pk_min_i16 v4, v4, v6
 ; GFX10-NEXT:    v_pk_min_i16 v5, v5, v8
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v3
@@ -5953,120 +5881,116 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, -1, -1
+; GFX9-NEXT:    s_sext_i32_i16 s8, s0
+; GFX9-NEXT:    s_ashr_i32 s9, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s10, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s10
+; GFX9-NEXT:    s_cselect_b32 s11, s8, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s9, -1
+; GFX9-NEXT:    s_cselect_b32 s12, s9, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX9-NEXT:    s_mov_b32 s6, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s12, s11, 16
+; GFX9-NEXT:    s_movk_i32 s13, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s11, s11, s6
+; GFX9-NEXT:    s_sub_i32 s12, s12, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s8, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s9, -1
+; GFX9-NEXT:    s_cselect_b32 s9, s9, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX9-NEXT:    s_mov_b32 s7, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s9, s8, 16
+; GFX9-NEXT:    s_mov_b32 s12, 0x8000
+; GFX9-NEXT:    s_sub_i32 s8, s8, s7
+; GFX9-NEXT:    s_sub_i32 s9, s9, s12
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX9-NEXT:    s_sext_i32_i16 s9, s11
+; GFX9-NEXT:    s_sext_i32_i16 s14, s3
+; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s9, s14
+; GFX9-NEXT:    s_cselect_b32 s9, s9, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s11, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s11, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX9-NEXT:    s_sext_i32_i16 s9, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s11, s8
-; GFX9-NEXT:    s_sext_i32_i16 s9, s0
-; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX9-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s11
-; GFX9-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s13, s10, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s13
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
-; GFX9-NEXT:    s_lshr_b32 s13, s12, 16
-; GFX9-NEXT:    s_lshr_b32 s14, s6, 16
-; GFX9-NEXT:    s_sub_i32 s12, s12, s6
-; GFX9-NEXT:    s_sub_i32 s13, s13, s14
 ; GFX9-NEXT:    s_cmp_lt_i32 s9, s11
 ; GFX9-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX9-NEXT:    s_movk_i32 s7, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s10, s10, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s7
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s13
-; GFX9-NEXT:    s_lshr_b32 s10, s9, 16
-; GFX9-NEXT:    s_lshr_b32 s13, s7, 16
-; GFX9-NEXT:    s_sub_i32 s9, s9, s7
-; GFX9-NEXT:    s_sub_i32 s10, s10, s13
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT:    s_sext_i32_i16 s10, s12
-; GFX9-NEXT:    s_sext_i32_i16 s15, s3
-; GFX9-NEXT:    s_ashr_i32 s12, s12, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s15
-; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s3
-; GFX9-NEXT:    s_cselect_b32 s3, s12, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX9-NEXT:    s_sext_i32_i16 s10, s3
-; GFX9-NEXT:    s_sext_i32_i16 s12, s9
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s10, s12
-; GFX9-NEXT:    s_cselect_b32 s10, s10, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s3, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s8
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s3
-; GFX9-NEXT:    s_sub_i32 s3, s9, s10
+; GFX9-NEXT:    s_sub_i32 s3, s8, s9
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s3, s1
-; GFX9-NEXT:    s_ashr_i32 s9, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s10, s3, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s8
-; GFX9-NEXT:    s_cselect_b32 s12, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s10, 16
-; GFX9-NEXT:    s_sub_i32 s10, s10, s6
-; GFX9-NEXT:    s_sub_i32 s12, s12, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s9, s8
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
+; GFX9-NEXT:    s_ashr_i32 s8, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s9, s3, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s8, -1
+; GFX9-NEXT:    s_cselect_b32 s11, s8, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s11
+; GFX9-NEXT:    s_lshr_b32 s11, s9, 16
+; GFX9-NEXT:    s_sub_i32 s9, s9, s6
+; GFX9-NEXT:    s_sub_i32 s11, s11, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s8, -1
+; GFX9-NEXT:    s_cselect_b32 s8, s8, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s11
 ; GFX9-NEXT:    s_sub_i32 s3, s3, s7
-; GFX9-NEXT:    s_sub_i32 s9, s9, s13
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT:    s_sext_i32_i16 s9, s10
-; GFX9-NEXT:    s_sext_i32_i16 s12, s4
-; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
+; GFX9-NEXT:    s_sub_i32 s8, s8, s12
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s8
+; GFX9-NEXT:    s_sext_i32_i16 s8, s9
+; GFX9-NEXT:    s_sext_i32_i16 s11, s4
+; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s9, s12
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s12
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s9, s4
-; GFX9-NEXT:    s_sext_i32_i16 s9, s4
-; GFX9-NEXT:    s_sext_i32_i16 s10, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s11
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s11
+; GFX9-NEXT:    s_cmp_gt_i32 s9, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s4
+; GFX9-NEXT:    s_sext_i32_i16 s8, s4
+; GFX9-NEXT:    s_sext_i32_i16 s9, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s9, s10
-; GFX9-NEXT:    s_cselect_b32 s9, s9, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s9
 ; GFX9-NEXT:    s_cmp_lt_i32 s4, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s1, s1, s3
-; GFX9-NEXT:    s_sub_i32 s3, s4, s9
+; GFX9-NEXT:    s_sub_i32 s3, s4, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX9-NEXT:    s_ashr_i32 s4, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s9, s3, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s10, s4, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s9, 16
-; GFX9-NEXT:    s_sub_i32 s6, s9, s6
-; GFX9-NEXT:    s_sub_i32 s9, s10, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s3, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, -1
+; GFX9-NEXT:    s_cselect_b32 s9, s4, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX9-NEXT:    s_lshr_b32 s9, s8, 16
+; GFX9-NEXT:    s_sub_i32 s6, s8, s6
+; GFX9-NEXT:    s_sub_i32 s8, s9, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, -1
+; GFX9-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s3, s3, s7
-; GFX9-NEXT:    s_sub_i32 s4, s4, s13
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
+; GFX9-NEXT:    s_sub_i32 s4, s4, s12
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s6
 ; GFX9-NEXT:    s_sext_i32_i16 s7, s5
@@ -6095,123 +6019,119 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX10-LABEL: s_ssubsat_v6i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s7, s0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s6
+; GFX10-NEXT:    s_sext_i32_i16 s6, s0
+; GFX10-NEXT:    s_sext_i32_i16 s7, -1
 ; GFX10-NEXT:    s_ashr_i32 s8, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
 ; GFX10-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX10-NEXT:    s_movk_i32 s14, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s11, s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s14
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s12, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s10, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s11, 16
-; GFX10-NEXT:    s_sub_i32 s10, s10, s11
-; GFX10-NEXT:    s_sub_i32 s12, s12, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX10-NEXT:    s_sext_i32_i16 s16, s3
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s6
+; GFX10-NEXT:    s_cselect_b32 s9, s6, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s8, -1
+; GFX10-NEXT:    s_mov_b32 s13, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s10, s8, -1
+; GFX10-NEXT:    s_sext_i32_i16 s15, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX10-NEXT:    s_mov_b32 s10, 0x7fff7fff
+; GFX10-NEXT:    s_lshr_b32 s11, s9, 16
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_sub_i32 s11, s11, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s6
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s8, -1
+; GFX10-NEXT:    s_cselect_b32 s8, s8, -1
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s10, s12
-; GFX10-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX10-NEXT:    s_lshr_b32 s12, s14, 16
-; GFX10-NEXT:    s_sext_i32_i16 s15, s8
-; GFX10-NEXT:    s_sub_i32 s7, s7, s14
-; GFX10-NEXT:    s_sub_i32 s10, s10, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s9, s11
+; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX10-NEXT:    s_mov_b32 s11, 0x8000
+; GFX10-NEXT:    s_sext_i32_i16 s14, s8
+; GFX10-NEXT:    s_sub_i32 s6, s6, s13
+; GFX10-NEXT:    s_sub_i32 s9, s9, s11
 ; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s10
-; GFX10-NEXT:    s_cselect_b32 s15, s15, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s15
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s15
 ; GFX10-NEXT:    s_cmp_gt_i32 s8, s3
-; GFX10-NEXT:    s_sext_i32_i16 s10, s7
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_cselect_b32 s3, s8, s3
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s15, s3
-; GFX10-NEXT:    s_sext_i32_i16 s16, s4
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s14, s3
+; GFX10-NEXT:    s_sext_i32_i16 s15, s4
 ; GFX10-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s7
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s6
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
 ; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX10-NEXT:    s_sub_i32 s0, s0, s3
-; GFX10-NEXT:    s_sub_i32 s3, s7, s8
-; GFX10-NEXT:    s_sext_i32_i16 s7, s1
+; GFX10-NEXT:    s_sub_i32 s3, s6, s8
+; GFX10-NEXT:    s_sext_i32_i16 s6, s1
 ; GFX10-NEXT:    s_ashr_i32 s8, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s10, 16
-; GFX10-NEXT:    s_sub_i32 s10, s10, s11
-; GFX10-NEXT:    s_sub_i32 s15, s15, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s6
+; GFX10-NEXT:    s_cselect_b32 s9, s6, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s8, -1
+; GFX10-NEXT:    s_cselect_b32 s14, s8, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s9, 16
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_sub_i32 s14, s14, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s8, -1
+; GFX10-NEXT:    s_cselect_b32 s8, s8, -1
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s10, s15
-; GFX10-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX10-NEXT:    s_sext_i32_i16 s15, s8
-; GFX10-NEXT:    s_sub_i32 s7, s7, s14
-; GFX10-NEXT:    s_sub_i32 s10, s10, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s9, s14
+; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX10-NEXT:    s_sext_i32_i16 s14, s8
+; GFX10-NEXT:    s_sub_i32 s6, s6, s13
+; GFX10-NEXT:    s_sub_i32 s9, s9, s11
 ; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s10
-; GFX10-NEXT:    s_cselect_b32 s15, s15, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s15
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s15
 ; GFX10-NEXT:    s_cmp_gt_i32 s8, s4
-; GFX10-NEXT:    s_sext_i32_i16 s10, s7
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s15, s4
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s14, s4
 ; GFX10-NEXT:    s_sext_i32_i16 s8, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s7
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s6
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s8, s4
 ; GFX10-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX10-NEXT:    s_sub_i32 s1, s1, s4
-; GFX10-NEXT:    s_sub_i32 s4, s7, s8
-; GFX10-NEXT:    s_sext_i32_i16 s7, s2
+; GFX10-NEXT:    s_sub_i32 s4, s6, s8
+; GFX10-NEXT:    s_sext_i32_i16 s6, s2
 ; GFX10-NEXT:    s_ashr_i32 s8, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s10, 16
-; GFX10-NEXT:    s_sub_i32 s10, s10, s11
-; GFX10-NEXT:    s_sub_i32 s11, s15, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s7, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s10, s11
+; GFX10-NEXT:    s_cselect_b32 s9, s6, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s8, -1
+; GFX10-NEXT:    s_cselect_b32 s14, s8, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s9, 16
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_sub_i32 s10, s14, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s8, -1
+; GFX10-NEXT:    s_cselect_b32 s7, s8, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s9, s10
 ; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX10-NEXT:    s_sext_i32_i16 s9, s7
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s5
-; GFX10-NEXT:    s_sub_i32 s6, s6, s14
-; GFX10-NEXT:    s_sub_i32 s8, s8, s12
+; GFX10-NEXT:    s_sub_i32 s6, s6, s13
+; GFX10-NEXT:    s_sub_i32 s8, s8, s11
 ; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX10-NEXT:    s_cmp_gt_i32 s9, s10
@@ -6423,36 +6343,33 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v8i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v8, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v8, v8, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v9, v0, s6
+; GFX9-NEXT:    v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v8, v8, v9
+; GFX9-NEXT:    v_pk_min_i16 v10, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0x80008000
 ; GFX9-NEXT:    v_pk_max_i16 v4, v8, v4
-; GFX9-NEXT:    v_pk_sub_i16 v9, v9, s5
-; GFX9-NEXT:    v_pk_min_i16 v4, v4, v9
+; GFX9-NEXT:    v_pk_sub_i16 v10, v10, v11
+; GFX9-NEXT:    v_pk_min_i16 v4, v4, v10
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX9-NEXT:    v_pk_min_i16 v8, v1, s6
+; GFX9-NEXT:    v_pk_max_i16 v4, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v9
+; GFX9-NEXT:    v_pk_min_i16 v8, v1, -1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_i16 v4, v4, v5
-; GFX9-NEXT:    v_pk_sub_i16 v8, v8, s5
+; GFX9-NEXT:    v_pk_sub_i16 v8, v8, v11
 ; GFX9-NEXT:    v_pk_min_i16 v4, v4, v8
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX9-NEXT:    v_pk_min_i16 v5, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, v5, s5
+; GFX9-NEXT:    v_pk_max_i16 v4, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v9
+; GFX9-NEXT:    v_pk_min_i16 v5, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v5, v5, v11
 ; GFX9-NEXT:    v_pk_max_i16 v4, v4, v6
 ; GFX9-NEXT:    v_pk_min_i16 v4, v4, v5
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX9-NEXT:    v_pk_min_i16 v5, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, v5, s5
+; GFX9-NEXT:    v_pk_max_i16 v4, v3, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v9
+; GFX9-NEXT:    v_pk_min_i16 v5, v3, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v5, v5, v11
 ; GFX9-NEXT:    v_pk_max_i16 v4, v4, v7
 ; GFX9-NEXT:    v_pk_min_i16 v4, v4, v5
 ; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v4
@@ -6462,30 +6379,25 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v8, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_max_i16 v10, v1, s4
-; GFX10-NEXT:    v_pk_max_i16 v12, v3, s4
-; GFX10-NEXT:    v_pk_min_i16 v9, v0, s4
-; GFX10-NEXT:    v_pk_sub_i16 v15, v8, s5
-; GFX10-NEXT:    v_pk_max_i16 v8, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v10, v10, s5
-; GFX10-NEXT:    v_pk_sub_i16 v12, v12, s5
-; GFX10-NEXT:    v_pk_min_i16 v11, v1, s4
-; GFX10-NEXT:    v_pk_min_i16 v13, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v8, v8, s5
-; GFX10-NEXT:    v_pk_min_i16 v14, v3, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v10, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v12, v3, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v9, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v11, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_sub_i16 v15, v8, 0x7fff7fff
+; GFX10-NEXT:    v_pk_max_i16 v8, v2, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_sub_i16 v10, v10, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v12, v12, 0x7fff7fff
+; GFX10-NEXT:    v_pk_min_i16 v13, v2, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v14, v3, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_sub_i16 v8, v8, 0x7fff7fff
 ; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_sub_i16 v9, v9, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v5, v10, v5
-; GFX10-NEXT:    v_pk_sub_i16 v11, v11, s6
-; GFX10-NEXT:    v_pk_sub_i16 v9, v9, s6
+; GFX10-NEXT:    v_pk_sub_i16 v11, v11, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v15, v8, v6
-; GFX10-NEXT:    v_pk_sub_i16 v10, v13, s6
-; GFX10-NEXT:    v_pk_sub_i16 v8, v14, s6
+; GFX10-NEXT:    v_pk_sub_i16 v10, v13, 0x80008000
+; GFX10-NEXT:    v_pk_sub_i16 v8, v14, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v7, v12, v7
 ; GFX10-NEXT:    v_pk_min_i16 v19, v4, v9
 ; GFX10-NEXT:    v_pk_min_i16 v11, v5, v11
@@ -6799,136 +6711,132 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, -1, -1
+; GFX9-NEXT:    s_sext_i32_i16 s10, s0
+; GFX9-NEXT:    s_ashr_i32 s11, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s12, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s13, s10, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s11, -1
+; GFX9-NEXT:    s_cselect_b32 s14, s11, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s14
+; GFX9-NEXT:    s_mov_b32 s8, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s14, s13, 16
+; GFX9-NEXT:    s_movk_i32 s15, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s13, s13, s8
+; GFX9-NEXT:    s_sub_i32 s14, s14, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s11, -1
+; GFX9-NEXT:    s_cselect_b32 s11, s11, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s14
+; GFX9-NEXT:    s_mov_b32 s9, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s11, s10, 16
+; GFX9-NEXT:    s_mov_b32 s14, 0x8000
+; GFX9-NEXT:    s_sub_i32 s10, s10, s9
+; GFX9-NEXT:    s_sub_i32 s11, s11, s14
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
+; GFX9-NEXT:    s_sext_i32_i16 s11, s13
+; GFX9-NEXT:    s_sext_i32_i16 s16, s4
+; GFX9-NEXT:    s_ashr_i32 s13, s13, 16
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s11, s16
+; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
+; GFX9-NEXT:    s_cmp_gt_i32 s13, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s13, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX9-NEXT:    s_sext_i32_i16 s11, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s13, s10
-; GFX9-NEXT:    s_sext_i32_i16 s11, s0
-; GFX9-NEXT:    s_ashr_i32 s12, s0, 16
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s13
-; GFX9-NEXT:    s_cselect_b32 s14, s11, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s15, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s8
-; GFX9-NEXT:    s_lshr_b32 s15, s14, 16
-; GFX9-NEXT:    s_lshr_b32 s16, s8, 16
-; GFX9-NEXT:    s_sub_i32 s14, s14, s8
-; GFX9-NEXT:    s_sub_i32 s15, s15, s16
 ; GFX9-NEXT:    s_cmp_lt_i32 s11, s13
 ; GFX9-NEXT:    s_cselect_b32 s11, s11, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s9, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s15
-; GFX9-NEXT:    s_lshr_b32 s12, s11, 16
-; GFX9-NEXT:    s_lshr_b32 s15, s9, 16
-; GFX9-NEXT:    s_sub_i32 s11, s11, s9
-; GFX9-NEXT:    s_sub_i32 s12, s12, s15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT:    s_sext_i32_i16 s12, s14
-; GFX9-NEXT:    s_sext_i32_i16 s17, s4
-; GFX9-NEXT:    s_ashr_i32 s14, s14, 16
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s17
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s17
-; GFX9-NEXT:    s_cmp_gt_i32 s14, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s14, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX9-NEXT:    s_sext_i32_i16 s12, s4
-; GFX9-NEXT:    s_sext_i32_i16 s14, s11
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s12, s14
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX9-NEXT:    s_lshr_b32 s11, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s10
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX9-NEXT:    s_lshr_b32 s10, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s4
-; GFX9-NEXT:    s_sub_i32 s4, s11, s12
+; GFX9-NEXT:    s_sub_i32 s4, s10, s11
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s1
-; GFX9-NEXT:    s_ashr_i32 s11, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s12, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s14, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT:    s_lshr_b32 s14, s12, 16
-; GFX9-NEXT:    s_sub_i32 s12, s12, s8
-; GFX9-NEXT:    s_sub_i32 s14, s14, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
+; GFX9-NEXT:    s_ashr_i32 s10, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s11, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s10, -1
+; GFX9-NEXT:    s_cselect_b32 s13, s10, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s11, 16
+; GFX9-NEXT:    s_sub_i32 s11, s11, s8
+; GFX9-NEXT:    s_sub_i32 s13, s13, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s10, -1
+; GFX9-NEXT:    s_cselect_b32 s10, s10, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s9
-; GFX9-NEXT:    s_sub_i32 s11, s11, s15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_sext_i32_i16 s11, s12
-; GFX9-NEXT:    s_sext_i32_i16 s14, s5
-; GFX9-NEXT:    s_ashr_i32 s12, s12, 16
+; GFX9-NEXT:    s_sub_i32 s10, s10, s14
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_sext_i32_i16 s10, s11
+; GFX9-NEXT:    s_sext_i32_i16 s13, s5
+; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s14
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s14
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s5
-; GFX9-NEXT:    s_cselect_b32 s5, s12, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s11, s5
-; GFX9-NEXT:    s_sext_i32_i16 s11, s5
-; GFX9-NEXT:    s_sext_i32_i16 s12, s4
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s13
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s13
+; GFX9-NEXT:    s_cmp_gt_i32 s11, s5
+; GFX9-NEXT:    s_cselect_b32 s5, s11, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s10, s5
+; GFX9-NEXT:    s_sext_i32_i16 s10, s5
+; GFX9-NEXT:    s_sext_i32_i16 s11, s4
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s12
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s11
 ; GFX9-NEXT:    s_cmp_lt_i32 s5, s4
 ; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s1, s1, s4
-; GFX9-NEXT:    s_sub_i32 s4, s5, s11
+; GFX9-NEXT:    s_sub_i32 s4, s5, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX9-NEXT:    s_ashr_i32 s5, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s11, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s12, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s11, 16
-; GFX9-NEXT:    s_sub_i32 s11, s11, s8
-; GFX9-NEXT:    s_sub_i32 s12, s12, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, -1
+; GFX9-NEXT:    s_cselect_b32 s11, s5, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
+; GFX9-NEXT:    s_lshr_b32 s11, s10, 16
+; GFX9-NEXT:    s_sub_i32 s10, s10, s8
+; GFX9-NEXT:    s_sub_i32 s11, s11, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, -1
+; GFX9-NEXT:    s_cselect_b32 s5, s5, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s9
-; GFX9-NEXT:    s_sub_i32 s5, s5, s15
+; GFX9-NEXT:    s_sub_i32 s5, s5, s14
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT:    s_sext_i32_i16 s5, s11
-; GFX9-NEXT:    s_sext_i32_i16 s12, s6
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_sext_i32_i16 s5, s10
+; GFX9-NEXT:    s_sext_i32_i16 s11, s6
+; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s12
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s12
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s11, s6
+; GFX9-NEXT:    s_cmp_gt_i32 s5, s11
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s6
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX9-NEXT:    s_sext_i32_i16 s6, s5
-; GFX9-NEXT:    s_sext_i32_i16 s11, s4
+; GFX9-NEXT:    s_sext_i32_i16 s10, s4
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s6, s11
-; GFX9-NEXT:    s_cselect_b32 s6, s6, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s6, s10
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s10
 ; GFX9-NEXT:    s_cmp_lt_i32 s5, s4
 ; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s6, s4
@@ -6939,23 +6847,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_ashr_i32 s5, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s6, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s6, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s6, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, -1
+; GFX9-NEXT:    s_cselect_b32 s10, s5, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s6, 16
 ; GFX9-NEXT:    s_sub_i32 s6, s6, s8
-; GFX9-NEXT:    s_sub_i32 s8, s11, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_sub_i32 s8, s10, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, -1
+; GFX9-NEXT:    s_cselect_b32 s5, s5, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s9
-; GFX9-NEXT:    s_sub_i32 s5, s5, s15
+; GFX9-NEXT:    s_sub_i32 s5, s5, s14
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX9-NEXT:    s_sext_i32_i16 s8, s7
@@ -6984,166 +6892,162 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX10-LABEL: s_ssubsat_v8i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s9, s0
-; GFX10-NEXT:    s_sext_i32_i16 s11, s8
+; GFX10-NEXT:    s_sext_i32_i16 s8, s0
+; GFX10-NEXT:    s_sext_i32_i16 s9, -1
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
 ; GFX10-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_movk_i32 s16, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s13, s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s16, s16, s16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s13
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s14, s14
-; GFX10-NEXT:    s_lshr_b32 s14, s12, 16
-; GFX10-NEXT:    s_lshr_b32 s15, s13, 16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s13
-; GFX10-NEXT:    s_sub_i32 s14, s14, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_sext_i32_i16 s18, s4
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, -1
+; GFX10-NEXT:    s_mov_b32 s15, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s12, s10, -1
+; GFX10-NEXT:    s_sext_i32_i16 s17, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX10-NEXT:    s_mov_b32 s12, 0x7fff7fff
+; GFX10-NEXT:    s_lshr_b32 s13, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s11, s12
+; GFX10-NEXT:    s_sub_i32 s13, s13, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s8
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s10, s10, -1
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s12, s14
-; GFX10-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX10-NEXT:    s_lshr_b32 s14, s16, 16
-; GFX10-NEXT:    s_sext_i32_i16 s17, s10
-; GFX10-NEXT:    s_sub_i32 s9, s9, s16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s11, s13
+; GFX10-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX10-NEXT:    s_mov_b32 s13, 0x8000
+; GFX10-NEXT:    s_sext_i32_i16 s16, s10
+; GFX10-NEXT:    s_sub_i32 s8, s8, s15
+; GFX10-NEXT:    s_sub_i32 s11, s11, s13
 ; GFX10-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s12
-; GFX10-NEXT:    s_cselect_b32 s17, s17, s18
+; GFX10-NEXT:    s_cmp_gt_i32 s16, s17
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s11
+; GFX10-NEXT:    s_cselect_b32 s16, s16, s17
 ; GFX10-NEXT:    s_cmp_gt_i32 s10, s4
-; GFX10-NEXT:    s_sext_i32_i16 s12, s9
+; GFX10-NEXT:    s_sext_i32_i16 s11, s8
 ; GFX10-NEXT:    s_cselect_b32 s4, s10, s4
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s17, s4
-; GFX10-NEXT:    s_sext_i32_i16 s18, s5
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
+; GFX10-NEXT:    s_sext_i32_i16 s17, s5
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s12
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s12
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX10-NEXT:    s_cselect_b32 s10, s10, s11
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s8
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
 ; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX10-NEXT:    s_sub_i32 s0, s0, s4
-; GFX10-NEXT:    s_sub_i32 s4, s9, s10
-; GFX10-NEXT:    s_sext_i32_i16 s9, s1
+; GFX10-NEXT:    s_sub_i32 s4, s8, s10
+; GFX10-NEXT:    s_sext_i32_i16 s8, s1
 ; GFX10-NEXT:    s_ashr_i32 s10, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s12, 16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s13
-; GFX10-NEXT:    s_sub_i32 s17, s17, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s8
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s16, s10, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s11, s12
+; GFX10-NEXT:    s_sub_i32 s16, s16, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s10, s10, -1
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s12, s17
-; GFX10-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX10-NEXT:    s_sext_i32_i16 s17, s10
-; GFX10-NEXT:    s_sub_i32 s9, s9, s16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s11, s16
+; GFX10-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX10-NEXT:    s_sext_i32_i16 s16, s10
+; GFX10-NEXT:    s_sub_i32 s8, s8, s15
+; GFX10-NEXT:    s_sub_i32 s11, s11, s13
 ; GFX10-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s12
-; GFX10-NEXT:    s_cselect_b32 s17, s17, s18
+; GFX10-NEXT:    s_cmp_gt_i32 s16, s17
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s11
+; GFX10-NEXT:    s_cselect_b32 s16, s16, s17
 ; GFX10-NEXT:    s_cmp_gt_i32 s10, s5
-; GFX10-NEXT:    s_sext_i32_i16 s12, s9
+; GFX10-NEXT:    s_sext_i32_i16 s11, s8
 ; GFX10-NEXT:    s_cselect_b32 s5, s10, s5
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s17, s5
-; GFX10-NEXT:    s_sext_i32_i16 s18, s6
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s16, s5
+; GFX10-NEXT:    s_sext_i32_i16 s17, s6
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s5
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s12
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s12
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s9
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX10-NEXT:    s_cselect_b32 s10, s10, s11
+; GFX10-NEXT:    s_cmp_lt_i32 s5, s8
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s10, s5
 ; GFX10-NEXT:    s_lshr_b32 s10, s5, 16
 ; GFX10-NEXT:    s_sub_i32 s1, s1, s5
-; GFX10-NEXT:    s_sub_i32 s5, s9, s10
-; GFX10-NEXT:    s_sext_i32_i16 s9, s2
+; GFX10-NEXT:    s_sub_i32 s5, s8, s10
+; GFX10-NEXT:    s_sext_i32_i16 s8, s2
 ; GFX10-NEXT:    s_ashr_i32 s10, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s12, 16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s13
-; GFX10-NEXT:    s_sub_i32 s17, s17, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s8
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s16, s10, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s11, s12
+; GFX10-NEXT:    s_sub_i32 s16, s16, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s10, s10, -1
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s12, s17
-; GFX10-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX10-NEXT:    s_sext_i32_i16 s17, s10
-; GFX10-NEXT:    s_sub_i32 s9, s9, s16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s11, s16
+; GFX10-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX10-NEXT:    s_sext_i32_i16 s16, s10
+; GFX10-NEXT:    s_sub_i32 s8, s8, s15
+; GFX10-NEXT:    s_sub_i32 s11, s11, s13
 ; GFX10-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s12
-; GFX10-NEXT:    s_cselect_b32 s17, s17, s18
+; GFX10-NEXT:    s_cmp_gt_i32 s16, s17
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s11
+; GFX10-NEXT:    s_cselect_b32 s16, s16, s17
 ; GFX10-NEXT:    s_cmp_gt_i32 s10, s6
-; GFX10-NEXT:    s_sext_i32_i16 s12, s9
+; GFX10-NEXT:    s_sext_i32_i16 s11, s8
 ; GFX10-NEXT:    s_cselect_b32 s6, s10, s6
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s17, s6
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s16, s6
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s6
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s12
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s12
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s9
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s10, s11
+; GFX10-NEXT:    s_cselect_b32 s10, s10, s11
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s8
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s10, s6
 ; GFX10-NEXT:    s_lshr_b32 s10, s6, 16
 ; GFX10-NEXT:    s_sub_i32 s2, s2, s6
-; GFX10-NEXT:    s_sub_i32 s6, s9, s10
-; GFX10-NEXT:    s_sext_i32_i16 s9, s3
+; GFX10-NEXT:    s_sub_i32 s6, s8, s10
+; GFX10-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX10-NEXT:    s_ashr_i32 s10, s3, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s12, 16
-; GFX10-NEXT:    s_sub_i32 s12, s12, s13
-; GFX10-NEXT:    s_sub_i32 s13, s17, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX10-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s9, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s12, s13
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s16, s10, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s11, s12
+; GFX10-NEXT:    s_sub_i32 s12, s16, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, -1
+; GFX10-NEXT:    s_cselect_b32 s9, s10, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s11, s12
 ; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
 ; GFX10-NEXT:    s_sext_i32_i16 s11, s9
 ; GFX10-NEXT:    s_sext_i32_i16 s12, s7
-; GFX10-NEXT:    s_sub_i32 s8, s8, s16
-; GFX10-NEXT:    s_sub_i32 s10, s10, s14
+; GFX10-NEXT:    s_sub_i32 s8, s8, s15
+; GFX10-NEXT:    s_sub_i32 s10, s10, s13
 ; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
 ; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX10-NEXT:    s_cmp_gt_i32 s11, s12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 5570309a5be7..3a742fbcbd91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2371,8 +2371,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-LABEL: v_uaddsat_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX9-NEXT:    v_pk_min_u16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2381,9 +2380,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v2, s4, v0
 ; GFX10-NEXT:    v_pk_min_u16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2439,8 +2437,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX9-LABEL: s_uaddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
-; GFX9-NEXT:    s_xor_b32 s2, s0, s2
+; GFX9-NEXT:    s_xor_b32 s2, s0, -1
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
@@ -2460,15 +2457,14 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX10-LABEL: s_uaddsat_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
+; GFX10-NEXT:    s_xor_b32 s2, s0, -1
 ; GFX10-NEXT:    s_mov_b32 s3, 0xffff
-; GFX10-NEXT:    s_xor_b32 s2, s0, s2
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX10-NEXT:    s_and_b32 s2, s2, s3
 ; GFX10-NEXT:    s_and_b32 s3, s1, s3
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX10-NEXT:    s_cmp_lt_u32 s2, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, s1
 ; GFX10-NEXT:    s_cselect_b32 s1, s4, s1
@@ -2522,17 +2518,15 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX9-LABEL: uaddsat_v2i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX9-NEXT:    s_xor_b32 s1, s0, s1
+; GFX9-NEXT:    s_xor_b32 s1, s0, -1
 ; GFX9-NEXT:    v_pk_min_u16 v0, s1, v0
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_v2i16_sv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
+; GFX10-NEXT:    s_xor_b32 s1, s0, -1
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_xor_b32 s1, s0, s1
 ; GFX10-NEXT:    v_pk_min_u16 v0, s1, v0
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2578,17 +2572,15 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX9-LABEL: uaddsat_v2i16_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v0
 ; GFX9-NEXT:    v_pk_min_u16 v1, v1, s0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_v2i16_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v0
 ; GFX10-NEXT:    v_pk_min_u16 v1, v1, s0
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2671,11 +2663,10 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX9-LABEL: v_uaddsat_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v0
 ; GFX9-NEXT:    v_pk_min_u16 v2, v4, v2
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v1
 ; GFX9-NEXT:    v_pk_min_u16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2684,10 +2675,9 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v1
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v4, s4, v0
-; GFX10-NEXT:    v_xor_b32_e32 v5, s4, v1
 ; GFX10-NEXT:    v_pk_min_u16 v2, v4, v2
 ; GFX10-NEXT:    v_pk_min_u16 v3, v5, v3
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2
@@ -2782,28 +2772,27 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_uaddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    s_xor_b32 s5, s0, s4
-; GFX9-NEXT:    s_mov_b32 s7, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s6, s5, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX9-NEXT:    s_and_b32 s5, s5, s7
-; GFX9-NEXT:    s_and_b32 s2, s2, s7
-; GFX9-NEXT:    s_cmp_lt_u32 s5, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_cmp_lt_u32 s6, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s6, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX9-NEXT:    s_xor_b32 s4, s0, -1
+; GFX9-NEXT:    s_mov_b32 s6, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX9-NEXT:    s_and_b32 s4, s4, s6
+; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_cmp_lt_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_cmp_lt_u32 s5, s7
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s7
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_add_i32 s0, s0, s2
-; GFX9-NEXT:    s_xor_b32 s2, s1, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s5
+; GFX9-NEXT:    s_add_i32 s4, s4, s5
+; GFX9-NEXT:    s_xor_b32 s2, s1, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    s_and_b32 s2, s2, s7
-; GFX9-NEXT:    s_and_b32 s3, s3, s7
+; GFX9-NEXT:    s_and_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s3, s3, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s2, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, s5
@@ -2818,38 +2807,37 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX10-LABEL: s_uaddsat_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_mov_b32 s6, 0xffff
-; GFX10-NEXT:    s_xor_b32 s5, s0, s4
-; GFX10-NEXT:    s_and_b32 s8, s2, s6
-; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
-; GFX10-NEXT:    s_and_b32 s5, s5, s6
+; GFX10-NEXT:    s_xor_b32 s4, s0, -1
+; GFX10-NEXT:    s_mov_b32 s5, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-NEXT:    s_and_b32 s7, s2, s5
+; GFX10-NEXT:    s_and_b32 s4, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s5, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s4, s7
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s8
-; GFX10-NEXT:    s_cmp_lt_u32 s7, s2
-; GFX10-NEXT:    s_cselect_b32 s2, s7, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s7
+; GFX10-NEXT:    s_cmp_lt_u32 s6, s2
+; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s2
-; GFX10-NEXT:    s_xor_b32 s2, s1, s4
-; GFX10-NEXT:    s_add_i32 s5, s5, s7
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s2, s6
-; GFX10-NEXT:    s_and_b32 s6, s3, s6
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
+; GFX10-NEXT:    s_add_i32 s4, s4, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX10-NEXT:    s_and_b32 s2, s2, s5
+; GFX10-NEXT:    s_and_b32 s5, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s2, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s5
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s6
-; GFX10-NEXT:    s_cmp_lt_u32 s4, s3
-; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX10-NEXT:    s_cmp_lt_u32 s2, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s5
+; GFX10-NEXT:    s_cmp_lt_u32 s6, s3
+; GFX10-NEXT:    s_cselect_b32 s3, s6, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_add_i32 s1, s1, s2
-; GFX10-NEXT:    s_add_i32 s3, s3, s4
+; GFX10-NEXT:    s_add_i32 s3, s3, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
@@ -2955,14 +2943,13 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_uaddsat_v6i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v0
 ; GFX9-NEXT:    v_pk_min_u16 v3, v6, v3
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
-; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX9-NEXT:    v_pk_min_u16 v3, v3, v4
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT:    v_pk_min_u16 v3, v3, v5
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2971,11 +2958,10 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v6, s4, v0
-; GFX10-NEXT:    v_xor_b32_e32 v7, s4, v1
-; GFX10-NEXT:    v_xor_b32_e32 v8, s4, v2
 ; GFX10-NEXT:    v_pk_min_u16 v3, v6, v3
 ; GFX10-NEXT:    v_pk_min_u16 v4, v7, v4
 ; GFX10-NEXT:    v_pk_min_u16 v5, v8, v5
@@ -3108,43 +3094,42 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX9-LABEL: s_uaddsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_xor_b32 s7, s0, s6
-; GFX9-NEXT:    s_mov_b32 s9, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX9-NEXT:    s_and_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
-; GFX9-NEXT:    s_cmp_lt_u32 s7, s3
-; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
-; GFX9-NEXT:    s_cmp_lt_u32 s8, s10
-; GFX9-NEXT:    s_cselect_b32 s7, s8, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s3
-; GFX9-NEXT:    s_add_i32 s7, s7, s8
-; GFX9-NEXT:    s_xor_b32 s3, s1, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
+; GFX9-NEXT:    s_xor_b32 s6, s0, -1
+; GFX9-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX9-NEXT:    s_and_b32 s6, s6, s8
+; GFX9-NEXT:    s_and_b32 s3, s3, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s6, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX9-NEXT:    s_cmp_lt_u32 s7, s9
+; GFX9-NEXT:    s_cselect_b32 s6, s7, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s4, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
-; GFX9-NEXT:    s_and_b32 s4, s4, s9
+; GFX9-NEXT:    s_add_i32 s0, s0, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_xor_b32 s3, s1, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX9-NEXT:    s_and_b32 s3, s3, s8
+; GFX9-NEXT:    s_and_b32 s4, s4, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX9-NEXT:    s_cmp_lt_u32 s7, s8
-; GFX9-NEXT:    s_cselect_b32 s4, s7, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s6, s7
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
-; GFX9-NEXT:    s_add_i32 s4, s4, s7
-; GFX9-NEXT:    s_xor_b32 s3, s2, s6
+; GFX9-NEXT:    s_add_i32 s4, s4, s6
+; GFX9-NEXT:    s_xor_b32 s3, s2, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s5, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, s9
-; GFX9-NEXT:    s_and_b32 s5, s5, s9
+; GFX9-NEXT:    s_and_b32 s3, s3, s8
+; GFX9-NEXT:    s_and_b32 s5, s5, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s3, s5
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, s6
@@ -3159,48 +3144,47 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX10-LABEL: s_uaddsat_v6i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX10-NEXT:    s_mov_b32 s8, 0xffff
-; GFX10-NEXT:    s_xor_b32 s7, s0, s6
-; GFX10-NEXT:    s_and_b32 s10, s3, s8
-; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX10-NEXT:    s_and_b32 s7, s7, s8
+; GFX10-NEXT:    s_xor_b32 s6, s0, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX10-NEXT:    s_and_b32 s9, s3, s7
+; GFX10-NEXT:    s_and_b32 s6, s6, s7
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s7, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s6, s9
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s10
-; GFX10-NEXT:    s_cmp_lt_u32 s9, s3
-; GFX10-NEXT:    s_cselect_b32 s3, s9, s3
-; GFX10-NEXT:    s_and_b32 s10, s4, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s7, s3
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s9
+; GFX10-NEXT:    s_cmp_lt_u32 s8, s3
+; GFX10-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX10-NEXT:    s_and_b32 s9, s4, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s3
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s3
-; GFX10-NEXT:    s_xor_b32 s3, s1, s6
-; GFX10-NEXT:    s_add_i32 s7, s7, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_xor_b32 s3, s1, -1
+; GFX10-NEXT:    s_add_i32 s6, s6, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX10-NEXT:    s_and_b32 s3, s3, s7
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s3, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s10
-; GFX10-NEXT:    s_cmp_lt_u32 s9, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX10-NEXT:    s_cmp_lt_u32 s3, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX10-NEXT:    s_cmp_lt_u32 s8, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_xor_b32 s3, s2, s6
-; GFX10-NEXT:    s_add_i32 s4, s4, s9
-; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-NEXT:    s_and_b32 s3, s3, s8
-; GFX10-NEXT:    s_and_b32 s8, s5, s8
+; GFX10-NEXT:    s_xor_b32 s3, s2, -1
+; GFX10-NEXT:    s_add_i32 s4, s4, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX10-NEXT:    s_and_b32 s3, s3, s7
+; GFX10-NEXT:    s_and_b32 s7, s5, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s3, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s3, s7
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s8
-; GFX10-NEXT:    s_cmp_lt_u32 s6, s5
-; GFX10-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
+; GFX10-NEXT:    s_cmp_lt_u32 s8, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s8, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
@@ -3324,17 +3308,16 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_uaddsat_v8i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    v_xor_b32_e32 v8, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v0
 ; GFX9-NEXT:    v_pk_min_u16 v4, v8, v4
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4
-; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v1
 ; GFX9-NEXT:    v_pk_min_u16 v4, v4, v5
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
-; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v2
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v2
 ; GFX9-NEXT:    v_pk_min_u16 v4, v4, v6
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v4
-; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v3
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v3
 ; GFX9-NEXT:    v_pk_min_u16 v4, v4, v7
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3343,12 +3326,11 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT:    v_xor_b32_e32 v15, -1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v19, -1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v23, -1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v3
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v15, s4, v0
-; GFX10-NEXT:    v_xor_b32_e32 v19, s4, v1
-; GFX10-NEXT:    v_xor_b32_e32 v23, s4, v2
-; GFX10-NEXT:    v_xor_b32_e32 v10, s4, v3
 ; GFX10-NEXT:    v_pk_min_u16 v11, v15, v4
 ; GFX10-NEXT:    v_pk_min_u16 v15, v19, v5
 ; GFX10-NEXT:    v_pk_min_u16 v19, v23, v6
@@ -3519,58 +3501,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_uaddsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, -1, -1
-; GFX9-NEXT:    s_xor_b32 s9, s0, s8
-; GFX9-NEXT:    s_mov_b32 s11, 0xffff
-; GFX9-NEXT:    s_lshr_b32 s10, s9, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX9-NEXT:    s_and_b32 s9, s9, s11
-; GFX9-NEXT:    s_and_b32 s4, s4, s11
-; GFX9-NEXT:    s_cmp_lt_u32 s9, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_cmp_lt_u32 s10, s12
-; GFX9-NEXT:    s_cselect_b32 s9, s10, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s9
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s4
-; GFX9-NEXT:    s_add_i32 s9, s9, s10
-; GFX9-NEXT:    s_xor_b32 s4, s1, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
+; GFX9-NEXT:    s_xor_b32 s8, s0, -1
+; GFX9-NEXT:    s_mov_b32 s10, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s9, s8, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_and_b32 s8, s8, s10
+; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s8, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s9, s11
+; GFX9-NEXT:    s_cselect_b32 s8, s9, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s5, 16
-; GFX9-NEXT:    s_and_b32 s4, s4, s11
-; GFX9-NEXT:    s_and_b32 s5, s5, s11
+; GFX9-NEXT:    s_add_i32 s0, s0, s4
+; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    s_xor_b32 s4, s1, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s5, 16
+; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b32 s5, s5, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, s5
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_cmp_lt_u32 s9, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s8, s9
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s9
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
-; GFX9-NEXT:    s_xor_b32 s4, s2, s8
+; GFX9-NEXT:    s_add_i32 s5, s5, s8
+; GFX9-NEXT:    s_xor_b32 s4, s2, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s6, 16
-; GFX9-NEXT:    s_and_b32 s4, s4, s11
-; GFX9-NEXT:    s_and_b32 s6, s6, s11
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b32 s6, s6, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX9-NEXT:    s_cmp_lt_u32 s5, s9
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s9
+; GFX9-NEXT:    s_cmp_lt_u32 s5, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX9-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9-NEXT:    s_add_i32 s5, s5, s6
-; GFX9-NEXT:    s_xor_b32 s4, s3, s8
+; GFX9-NEXT:    s_xor_b32 s4, s3, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX9-NEXT:    s_and_b32 s4, s4, s11
-; GFX9-NEXT:    s_and_b32 s7, s7, s11
+; GFX9-NEXT:    s_and_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b32 s7, s7, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, s7
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s7
 ; GFX9-NEXT:    s_cmp_lt_u32 s5, s6
@@ -3585,63 +3566,62 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX10-LABEL: s_uaddsat_v8i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, -1, -1
-; GFX10-NEXT:    s_mov_b32 s10, 0xffff
-; GFX10-NEXT:    s_xor_b32 s9, s0, s8
-; GFX10-NEXT:    s_and_b32 s12, s4, s10
-; GFX10-NEXT:    s_lshr_b32 s11, s9, 16
-; GFX10-NEXT:    s_and_b32 s9, s9, s10
+; GFX10-NEXT:    s_xor_b32 s8, s0, -1
+; GFX10-NEXT:    s_mov_b32 s9, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
+; GFX10-NEXT:    s_and_b32 s11, s4, s9
+; GFX10-NEXT:    s_and_b32 s8, s8, s9
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s9, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s8, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s12
-; GFX10-NEXT:    s_cmp_lt_u32 s11, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX10-NEXT:    s_and_b32 s12, s5, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s9, s4
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s11
+; GFX10-NEXT:    s_cmp_lt_u32 s10, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX10-NEXT:    s_and_b32 s11, s5, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s8, s4
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s4
-; GFX10-NEXT:    s_xor_b32 s4, s1, s8
-; GFX10-NEXT:    s_add_i32 s9, s9, s11
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX10-NEXT:    s_and_b32 s4, s4, s10
+; GFX10-NEXT:    s_xor_b32 s4, s1, -1
+; GFX10-NEXT:    s_add_i32 s8, s8, s10
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_and_b32 s4, s4, s9
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s4, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s12
-; GFX10-NEXT:    s_cmp_lt_u32 s11, s5
-; GFX10-NEXT:    s_cselect_b32 s5, s11, s5
-; GFX10-NEXT:    s_and_b32 s12, s6, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s4, s11
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX10-NEXT:    s_cmp_lt_u32 s10, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s10, s5
+; GFX10-NEXT:    s_and_b32 s11, s6, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX10-NEXT:    s_add_i32 s1, s1, s4
-; GFX10-NEXT:    s_xor_b32 s4, s2, s8
-; GFX10-NEXT:    s_add_i32 s5, s5, s11
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX10-NEXT:    s_and_b32 s4, s4, s10
+; GFX10-NEXT:    s_xor_b32 s4, s2, -1
+; GFX10-NEXT:    s_add_i32 s5, s5, s10
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_and_b32 s4, s4, s9
 ; GFX10-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s4, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s4, s11
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s12
-; GFX10-NEXT:    s_cmp_lt_u32 s11, s6
-; GFX10-NEXT:    s_cselect_b32 s6, s11, s6
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX10-NEXT:    s_cmp_lt_u32 s10, s6
+; GFX10-NEXT:    s_cselect_b32 s6, s10, s6
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
 ; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX10-NEXT:    s_add_i32 s2, s2, s4
-; GFX10-NEXT:    s_xor_b32 s4, s3, s8
-; GFX10-NEXT:    s_add_i32 s6, s6, s11
-; GFX10-NEXT:    s_lshr_b32 s8, s4, 16
-; GFX10-NEXT:    s_and_b32 s4, s4, s10
-; GFX10-NEXT:    s_and_b32 s10, s7, s10
+; GFX10-NEXT:    s_xor_b32 s4, s3, -1
+; GFX10-NEXT:    s_add_i32 s6, s6, s10
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_and_b32 s4, s4, s9
+; GFX10-NEXT:    s_and_b32 s9, s7, s9
 ; GFX10-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX10-NEXT:    s_cmp_lt_u32 s4, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s4, s9
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s10
-; GFX10-NEXT:    s_cmp_lt_u32 s8, s7
-; GFX10-NEXT:    s_cselect_b32 s7, s8, s7
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
+; GFX10-NEXT:    s_cmp_lt_u32 s10, s7
+; GFX10-NEXT:    s_cselect_b32 s7, s10, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
 ; GFX10-NEXT:    s_lshr_b32 s7, s4, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 2512aaaeb082..474f6655bda2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -50,16 +50,14 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ;
 ; GFX900-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
 ; GFX900-NEXT:    s_xor_b32 s0, s0, s1
-; GFX900-NEXT:    s_xor_b32 s0, s0, s2
+; GFX900-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX900-NEXT:    ; return to shader part epilog
 ;
 ; GFX906-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
 ; GFX906-NEXT:    s_xor_b32 s0, s0, s1
-; GFX906-NEXT:    s_xor_b32 s0, s0, s2
+; GFX906-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX906-NEXT:    ; return to shader part epilog
 entry:
   %xor = xor <2 x i16> %a, %b
@@ -150,7 +148,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ;
 ; GFX900-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX900-NEXT:    s_mov_b32 s4, -1
 ; GFX900-NEXT:    s_mov_b32 s5, s4
 ; GFX900-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX900-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
@@ -158,7 +156,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ;
 ; GFX906-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
+; GFX906-NEXT:    s_mov_b32 s4, -1
 ; GFX906-NEXT:    s_mov_b32 s5, s4
 ; GFX906-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX906-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]


        


More information about the llvm-commits mailing list