[llvm] 36beaa3 - Reland AMDGPU/GlobalISel: Combine zext(trunc x) to x after RegBankSelect

Petar Avramovic via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 5 02:10:47 PST 2021


Author: Petar Avramovic
Date: 2021-03-05T11:05:37+01:00
New Revision: 36beaa3ba3b334fa0be0afd0a2b0da9c7c8afd34

URL: https://github.com/llvm/llvm-project/commit/36beaa3ba3b334fa0be0afd0a2b0da9c7c8afd34
DIFF: https://github.com/llvm/llvm-project/commit/36beaa3ba3b334fa0be0afd0a2b0da9c7c8afd34.diff

LOG: Reland AMDGPU/GlobalISel: Combine zext(trunc x) to x after RegBankSelect

Recommit bf5a5826504754788a8f1e3fec7a7dc95cda5782. Depends on
4c8fb7ddd6fa49258e0e9427e7345fb56ba522d4 which was reverted.

RegBankSelect creates zext and trunc when it selects banks for uniform i1.
Add zext_trunc_fold from generic combiner to post RegBankSelect combiner.

Differential Revision: https://reviews.llvm.org/D95432

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCombine.td
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b6a6fb3e77db..5faebb2a538a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -64,6 +64,6 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
 }
 
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
-  "AMDGPUGenRegBankCombinerHelper", []> {
+  "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 91889af4acf0..fe81a3f10d6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,6 +228,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
+  initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
@@ -807,6 +808,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
   bool addLegalizeMachineIR() override;
   void addPreRegBankSelect() override;
   bool addRegBankSelect() override;
+  void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
@@ -1113,6 +1115,11 @@ bool GCNPassConfig::addRegBankSelect() {
   return false;
 }
 
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+  addPass(createAMDGPURegBankCombiner(IsOptNone));
+}
+
 bool GCNPassConfig::addGlobalInstructionSelect() {
   addPass(new InstructionSelect(getOptLevel()));
   // TODO: Fix instruction selection to do the right thing for image

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 4dd010994fc3..612f4fec3426 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -139,9 +139,6 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
-; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
-; CHECK-NEXT:    s_and_b32 s4, s4, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
 ; CHECK-NEXT:    s_cbranch_scc1 BB4_4
 ; CHECK-NEXT:  ; %bb.1: ; %bb2
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -161,9 +158,6 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  BB4_3: ; %bb8
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
-; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
-; CHECK-NEXT:    s_and_b32 s4, s4, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
 ; CHECK-NEXT:    s_cbranch_scc0 BB4_5
 ; CHECK-NEXT:  BB4_4: ; %bb12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 9530d64bef92..df9a807f381c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -9,9 +9,6 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s1, 0
-; GCN-NEXT:    s_cselect_b32 s1, 1, 0
-; GCN-NEXT:    s_and_b32 s1, s1, 1
-; GCN-NEXT:    s_cmp_lg_u32 s1, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_2
 ; GCN-NEXT:  ; %bb.1: ; %mid
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
index ba685dd6d5d2..031af117ce7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -8,9 +8,6 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s2, 1, 0
-; GCN-NEXT:    s_and_b32 s2, s2, 1
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_2
 ; GCN-NEXT:  ; %bb.1: ; %mid
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 79c511638d7a..55c132d3a029 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -53,9 +53,6 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
 ; CI-NEXT:    s_load_dword s0, s[4:5], 0x11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_lg_u32 s1, s0
-; CI-NEXT:    s_cselect_b32 s0, 1, 0
-; CI-NEXT:    s_and_b32 s0, s0, 1
-; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 BB1_2
 ; CI-NEXT:  ; %bb.1: ; %bb0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -71,9 +68,6 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
 ; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, s0
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cbranch_scc1 BB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 75792d56ce42..90bf655286e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -53,9 +53,6 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
 ; CI-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_lg_u32 s1, s0
-; CI-NEXT:    s_cselect_b32 s0, 1, 0
-; CI-NEXT:    s_and_b32 s0, s0, 1
-; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 BB1_2
 ; CI-NEXT:  ; %bb.1: ; %bb0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -71,9 +68,6 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
 ; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, s0
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cbranch_scc1 BB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 46acf91f789c..a1b55ac31420 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -43,19 +43,18 @@ define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
 define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) {
 ; GCN-LABEL: set_inactive_scc:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_buffer_load_dword s2, s[4:7], 0x0
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s2, 56
+; GCN-NEXT:    s_buffer_load_dword s1, s[8:11], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s1, 56
 ; GCN-NEXT:    s_cselect_b32 s0, 1, 0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, 42
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_and_b32 s0, s0, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_cbranch_scc0 BB2_2
 ; GCN-NEXT:  ; %bb.1: ; %.one

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 13f4528d2f1d..3eeea3cc79e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -21,18 +21,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s6, 1, 0
-; GCN-NEXT:    s_and_b32 s6, s6, 1
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_3
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s6, 1, 0
-; GCN-NEXT:    s_and_b32 s6, s6, 1
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_3
 ; GCN-NEXT:  ; %bb.2: ; %bb.1
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -102,9 +96,6 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s6, 1, 0
-; GCN-NEXT:    s_and_b32 s6, s6, 1
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 1da64b760b17..0aa9fedbd24e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4824,11 +4824,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX6-NEXT:    s_and_b32 s12, s12, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT:    s_and_b32 s6, s13, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
@@ -4917,11 +4915,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX8-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX8-NEXT:    s_and_b32 s12, s12, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT:    s_and_b32 s6, s13, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s5, s4
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
@@ -5010,11 +5006,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX9-NEXT:    s_and_b32 s12, s12, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT:    s_and_b32 s6, s13, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
@@ -5085,55 +5079,53 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_sub_i32 s13, s12, 64
 ; GFX10-NEXT:    s_and_b32 s14, 1, s1
-; GFX10-NEXT:    s_sub_i32 s2, 64, s12
+; GFX10-NEXT:    s_sub_i32 s15, 64, s12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[6:7], 0
-; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s14
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s14
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[8:9], s12
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[10:11], s15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s12
-; GFX10-NEXT:    s_lshl_b64 s[2:3], s[10:11], s2
-; GFX10-NEXT:    s_ashr_i64 s[4:5], s[10:11], s12
-; GFX10-NEXT:    s_and_b32 s12, s15, 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX10-NEXT:    s_ashr_i64 s[6:7], s[10:11], s13
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT:    s_and_b32 s6, s16, 1
+; GFX10-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[10:11], s12
+; GFX10-NEXT:    s_ashr_i64 s[4:5], s[10:11], s13
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_mov_b32 s7, s6
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s0, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_add_u32 s2, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s8
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s2, s2, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s1, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
@@ -6123,11 +6115,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX6-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX6-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX6-NEXT:    s_and_b32 s23, s23, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX6-NEXT:    s_and_b32 s10, s24, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_mov_b32 s9, s8
@@ -6199,11 +6189,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX6-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX6-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX6-NEXT:    s_and_b32 s12, s12, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX6-NEXT:    s_and_b32 s10, s13, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX6-NEXT:    s_mov_b32 s9, s8
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6297,11 +6285,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX8-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX8-NEXT:    s_and_b32 s23, s23, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX8-NEXT:    s_and_b32 s10, s24, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_mov_b32 s9, s8
@@ -6380,11 +6366,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX8-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX8-NEXT:    s_and_b32 s12, s12, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX8-NEXT:    s_and_b32 s10, s13, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX8-NEXT:    s_mov_b32 s9, s8
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6477,11 +6461,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX9-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX9-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT:    s_and_b32 s10, s24, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_mov_b32 s9, s8
@@ -6560,11 +6542,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX9-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX9-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX9-NEXT:    s_and_b32 s12, s12, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX9-NEXT:    s_and_b32 s10, s13, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6610,15 +6590,15 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ;
 ; GFX10-LABEL: s_saddsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s28, s0, s8
+; GFX10-NEXT:    s_add_u32 s16, s0, s8
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_mov_b32 s46, s0
 ; GFX10-NEXT:    s_and_b32 s17, s17, 1
 ; GFX10-NEXT:    s_mov_b32 s47, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX10-NEXT:    s_addc_u32 s29, s1, s9
+; GFX10-NEXT:    s_addc_u32 s17, s1, s9
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[28:29], s[46:47]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
 ; GFX10-NEXT:    s_and_b32 s18, s18, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_addc_u32 s30, s2, s10
@@ -6638,136 +6618,132 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_sub_i32 s21, s20, 64
 ; GFX10-NEXT:    s_sub_i32 s22, 64, s20
 ; GFX10-NEXT:    s_cmp_lt_u32 s20, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s20, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[16:17], s20
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[30:31], s22
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[28:29], s20
-; GFX10-NEXT:    s_lshl_b64 s[2:3], s[30:31], s22
-; GFX10-NEXT:    s_and_b32 s24, s10, 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_ashr_i32 s2, s31, 31
-; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s20
-; GFX10-NEXT:    s_ashr_i64 s[10:11], s[30:31], s21
+; GFX10-NEXT:    s_ashr_i32 s10, s31, 31
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[30:31], s20
+; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s21
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    s_mov_b32 s11, s10
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_and_b32 s10, s23, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s29
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[28:29], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s0, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s17
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_add_u32 s2, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s28
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_brev_b32 s23, 1
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s31
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    s_addc_u32 s2, s2, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_addc_u32 s1, s1, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT:    s_add_u32 s2, s4, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s30
-; GFX10-NEXT:    s_addc_u32 s3, s3, s23
-; GFX10-NEXT:    s_add_u32 s0, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_addc_u32 s1, s5, s13
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_addc_u32 s3, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[2:3], s[4:5]
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s8, s6, s14
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[6:7]
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s9
-; GFX10-NEXT:    s_and_b32 s2, 1, s2
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[12:13], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s16, 1, s3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[12:13], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, 1, s1
 ; GFX10-NEXT:    s_cmp_lt_u32 s20, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s2
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[14:15], 0
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[14:15], 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s20, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s16
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshl_b64 s[4:5], s[8:9], s22
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[0:1], s20
-; GFX10-NEXT:    s_and_b32 s13, s10, 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX10-NEXT:    s_ashr_i64 s[6:7], s[8:9], s20
-; GFX10-NEXT:    s_ashr_i64 s[10:11], s[8:9], s21
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX10-NEXT:    s_and_b32 s10, s12, 1
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s20
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[8:9], s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    s_ashr_i32 s10, s9, 31
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[8:9], s20
+; GFX10-NEXT:    s_ashr_i64 s[6:7], s[8:9], s21
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_mov_b32 s11, s10
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_add_u32 s2, s2, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s3
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_add_u32 s4, s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    s_addc_u32 s4, s4, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT:    s_addc_u32 s1, s5, s23
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX10-NEXT:    v_readfirstlane_b32 s6, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 375b35e010ea..753aec99cdb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4810,11 +4810,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX6-NEXT:    s_and_b32 s12, s12, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT:    s_and_b32 s6, s13, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
@@ -4903,11 +4901,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX8-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX8-NEXT:    s_and_b32 s12, s12, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT:    s_and_b32 s6, s13, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s5, s4
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
@@ -4996,11 +4992,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    s_ashr_i64 s[6:7], s[10:11], s6
-; GFX9-NEXT:    s_and_b32 s12, s12, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT:    s_and_b32 s6, s13, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
@@ -5071,55 +5065,53 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_sub_i32 s13, s12, 64
 ; GFX10-NEXT:    s_and_b32 s14, 1, s1
-; GFX10-NEXT:    s_sub_i32 s2, 64, s12
+; GFX10-NEXT:    s_sub_i32 s15, 64, s12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[6:7], 0
-; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s14
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s14
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[8:9], s12
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[10:11], s15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s12
-; GFX10-NEXT:    s_lshl_b64 s[2:3], s[10:11], s2
-; GFX10-NEXT:    s_ashr_i64 s[4:5], s[10:11], s12
-; GFX10-NEXT:    s_and_b32 s12, s15, 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX10-NEXT:    s_ashr_i64 s[6:7], s[10:11], s13
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT:    s_and_b32 s6, s16, 1
+; GFX10-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[10:11], s12
+; GFX10-NEXT:    s_ashr_i64 s[4:5], s[10:11], s13
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_mov_b32 s7, s6
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s0, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_add_u32 s2, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s8
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s2, s2, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s1, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
@@ -6109,11 +6101,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX6-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX6-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX6-NEXT:    s_and_b32 s23, s23, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX6-NEXT:    s_and_b32 s10, s24, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_mov_b32 s9, s8
@@ -6185,11 +6175,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX6-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX6-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX6-NEXT:    s_and_b32 s12, s12, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX6-NEXT:    s_and_b32 s10, s13, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX6-NEXT:    s_mov_b32 s9, s8
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6283,11 +6271,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX8-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX8-NEXT:    s_and_b32 s23, s23, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX8-NEXT:    s_and_b32 s10, s24, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_mov_b32 s9, s8
@@ -6366,11 +6352,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX8-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX8-NEXT:    s_and_b32 s12, s12, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX8-NEXT:    s_and_b32 s10, s13, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX8-NEXT:    s_mov_b32 s9, s8
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6463,11 +6447,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX9-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
 ; GFX9-NEXT:    s_ashr_i64 s[10:11], s[18:19], s21
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT:    s_and_b32 s10, s24, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_mov_b32 s9, s8
@@ -6546,11 +6528,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX9-NEXT:    s_ashr_i64 s[4:5], s[2:3], s20
 ; GFX9-NEXT:    s_ashr_i64 s[10:11], s[2:3], s21
-; GFX9-NEXT:    s_and_b32 s12, s12, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
-; GFX9-NEXT:    s_and_b32 s10, s13, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
@@ -6596,15 +6576,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ;
 ; GFX10-LABEL: s_ssubsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sub_u32 s28, s0, s8
+; GFX10-NEXT:    s_sub_u32 s16, s0, s8
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_mov_b32 s46, s0
 ; GFX10-NEXT:    s_and_b32 s17, s17, 1
 ; GFX10-NEXT:    s_mov_b32 s47, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX10-NEXT:    s_subb_u32 s29, s1, s9
+; GFX10-NEXT:    s_subb_u32 s17, s1, s9
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[28:29], s[46:47]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
 ; GFX10-NEXT:    s_and_b32 s18, s18, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_subb_u32 s30, s2, s10
@@ -6624,136 +6604,132 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_sub_i32 s21, s20, 64
 ; GFX10-NEXT:    s_sub_i32 s22, 64, s20
 ; GFX10-NEXT:    s_cmp_lt_u32 s20, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s20, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[16:17], s20
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[30:31], s22
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[28:29], s20
-; GFX10-NEXT:    s_lshl_b64 s[2:3], s[30:31], s22
-; GFX10-NEXT:    s_and_b32 s24, s10, 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_ashr_i32 s2, s31, 31
-; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s20
-; GFX10-NEXT:    s_ashr_i64 s[10:11], s[30:31], s21
+; GFX10-NEXT:    s_ashr_i32 s10, s31, 31
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[30:31], s20
+; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s21
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    s_mov_b32 s11, s10
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_and_b32 s10, s23, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s29
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[28:29], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s0, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[16:17], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s17
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_add_u32 s2, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s28
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_brev_b32 s23, 1
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s31
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    s_addc_u32 s2, s2, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_addc_u32 s1, s1, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT:    s_sub_u32 s2, s4, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s30
-; GFX10-NEXT:    s_addc_u32 s3, s3, s23
-; GFX10-NEXT:    s_sub_u32 s0, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_subb_u32 s1, s5, s13
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_subb_u32 s3, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[2:3], s[4:5]
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_subb_u32 s8, s6, s14
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_subb_u32 s9, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[6:7]
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s9
-; GFX10-NEXT:    s_and_b32 s2, 1, s2
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[12:13], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s16, 1, s3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, 1, s1
 ; GFX10-NEXT:    s_cmp_lt_u32 s20, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s2
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[14:15], 0
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[14:15], 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s20, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s16
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshl_b64 s[4:5], s[8:9], s22
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[0:1], s20
-; GFX10-NEXT:    s_and_b32 s13, s10, 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX10-NEXT:    s_ashr_i64 s[6:7], s[8:9], s20
-; GFX10-NEXT:    s_ashr_i64 s[10:11], s[8:9], s21
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
-; GFX10-NEXT:    s_and_b32 s10, s12, 1
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s20
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[8:9], s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    s_ashr_i32 s10, s9, 31
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[8:9], s20
+; GFX10-NEXT:    s_ashr_i64 s[6:7], s[8:9], s21
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_mov_b32 s11, s10
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_add_u32 s2, s2, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s3
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_add_u32 s4, s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    s_addc_u32 s4, s4, 0
+; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT:    s_addc_u32 s1, s5, s23
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX10-NEXT:    v_readfirstlane_b32 s6, v6


        


More information about the llvm-commits mailing list