[llvm] 2179867 - [AArch64] Select saturating Neon instructions

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 31 10:29:19 PDT 2019


Author: David Green
Date: 2019-10-31T17:28:36Z
New Revision: 2179867ddc2990b141d53ade1d66fc1af66b412e

URL: https://github.com/llvm/llvm-project/commit/2179867ddc2990b141d53ade1d66fc1af66b412e
DIFF: https://github.com/llvm/llvm-project/commit/2179867ddc2990b141d53ade1d66fc1af66b412e.diff

LOG: [AArch64] Select saturating Neon instructions

This adds some extra patterns to select AArch64 Neon SQADD, UQADD, SQSUB
and UQSUB from the existing target independent sadd_sat, uadd_sat,
ssub_sat and usub_sat nodes.

It does not attempt to replace the existing int_aarch64_neon_uqadd
intrinsic nodes as they are apparently used for both scalar and vector,
and need to be legal on scalar types for some of the patterns to work.
The int_aarch64_neon_uqadd on scalar would move the two integers into
floating point registers, perform a Neon uqadd and move the value back.
I don't believe it is a good idea for uadd_sat to do the same, as the
scalar alternative is simpler (an adds with a csinv). For signed it may
be smaller, but I'm not sure about it being better.

So this just adds some extra patterns for the existing vector
instructions, matching on the _sat nodes.

Differential Revision: https://reviews.llvm.org/D69374

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/sadd_sat.ll
    llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/ssub_sat.ll
    llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
    llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/usub_sat_vec.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4a5d3bc4dfd5..3ea3fae9ec5c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -741,14 +741,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 
-    // Vector reductions
     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                     MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+      // Vector reductions
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+
+      // Saturates
+      setOperationAction(ISD::SADDSAT, VT, Legal);
+      setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::SSUBSAT, VT, Legal);
+      setOperationAction(ISD::USUBSAT, VT, Legal);
     }
     for (MVT VT : { MVT::v4f16, MVT::v2f32,
                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f555e4123307..10b5a21a9222 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5066,6 +5066,24 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
          [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
 }
 
+multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
+  def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+          (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
+}
+
 // As above, but D sized elements unsupported.
 multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
                                   SDPatternOperator OpNode> {

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c0cb471c9391..77bee6019111 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3839,6 +3839,12 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
 defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                     int_aarch64_neon_sqsub>;
 
+// Extra saturate patterns, other than the intrinsics matches above
+defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
+
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index a28beb744a7f..99711660615e 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -88,15 +88,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
-; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
   ret <4 x i32> %tmp;

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index ab886b883afb..893ed6445462 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -35,15 +35,7 @@ declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>)
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    movi v3.16b, #127
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
@@ -52,24 +44,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v4.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmlt v7.16b, v4.16b, #0
-; CHECK-NEXT:    movi v6.16b, #127
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    add v7.16b, v1.16b, v3.16b
-; CHECK-NEXT:    cmlt v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmlt v16.16b, v7.16b, #0
-; CHECK-NEXT:    movi v5.16b, #127
-; CHECK-NEXT:    cmlt v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, v7.16b
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    sqadd v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -78,42 +54,10 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK-LABEL: v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v16.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmlt v24.16b, v16.16b, #0
-; CHECK-NEXT:    movi v18.16b, #127
-; CHECK-NEXT:    add v19.16b, v1.16b, v5.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v19.16b, #0
-; CHECK-NEXT:    movi v20.16b, #127
-; CHECK-NEXT:    add v21.16b, v2.16b, v6.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v21.16b, #0
-; CHECK-NEXT:    cmlt v4.16b, v4.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    movi v22.16b, #127
-; CHECK-NEXT:    add v23.16b, v3.16b, v7.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmlt v4.16b, v5.16b, #0
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, v19.16b
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v23.16b, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.16b, v6.16b, #0
-; CHECK-NEXT:    cmgt v2.16b, v2.16b, v21.16b
-; CHECK-NEXT:    movi v17.16b, #127
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmlt v4.16b, v7.16b, #0
-; CHECK-NEXT:    cmgt v3.16b, v3.16b, v23.16b
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    sqadd v1.16b, v1.16b, v5.16b
+; CHECK-NEXT:    sqadd v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    sqadd v3.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    ret
   %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
@@ -122,15 +66,7 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    mvni v3.8h, #128, lsl #8
-; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %z
@@ -139,24 +75,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v4.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
-; CHECK-NEXT:    mvni v6.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    add v7.8h, v1.8h, v3.8h
-; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v7.8h
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    sqadd v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -165,42 +85,10 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; CHECK-LABEL: v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v16.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmlt v24.8h, v16.8h, #0
-; CHECK-NEXT:    mvni v18.8h, #128, lsl #8
-; CHECK-NEXT:    add v19.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v19.8h, #0
-; CHECK-NEXT:    mvni v20.8h, #128, lsl #8
-; CHECK-NEXT:    add v21.8h, v2.8h, v6.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v21.8h, #0
-; CHECK-NEXT:    cmlt v4.8h, v4.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mvni v22.8h, #128, lsl #8
-; CHECK-NEXT:    add v23.8h, v3.8h, v7.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmlt v4.8h, v5.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v19.8h
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v23.8h, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.8h, v6.8h, #0
-; CHECK-NEXT:    cmgt v2.8h, v2.8h, v21.8h
-; CHECK-NEXT:    mvni v17.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmlt v4.8h, v7.8h, #0
-; CHECK-NEXT:    cmgt v3.8h, v3.8h, v23.8h
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqadd v0.8h, v0.8h, v4.8h
+; CHECK-NEXT:    sqadd v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    sqadd v2.8h, v2.8h, v6.8h
+; CHECK-NEXT:    sqadd v3.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    ret
   %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
@@ -211,15 +99,7 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    movi v2.8b, #127
-; CHECK-NEXT:    add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmlt v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, <8 x i8>* %px
@@ -248,11 +128,10 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    mov v1.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w10
 ; CHECK-NEXT:    mov v1.h[3], w11
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    movi v1.4h, #127
-; CHECK-NEXT:    smin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    mvni v1.4h, #127
-; CHECK-NEXT:    smax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -266,19 +145,18 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsb w8, [x0]
-; CHECK-NEXT:    ldrsb w9, [x1]
-; CHECK-NEXT:    ldrsb w10, [x0, #1]
-; CHECK-NEXT:    ldrsb w11, [x1, #1]
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    ldrb w10, [x0, #1]
+; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    movi v1.2s, #127
-; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mvni v1.2s, #127
-; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w8, [x2, #1]
@@ -296,15 +174,7 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, <4 x i16>* %px
@@ -317,19 +187,18 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh w8, [x0]
-; CHECK-NEXT:    ldrsh w9, [x1]
-; CHECK-NEXT:    ldrsh w10, [x0, #2]
-; CHECK-NEXT:    ldrsh w11, [x1, #2]
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x1]
+; CHECK-NEXT:    ldrh w10, [x0, #2]
+; CHECK-NEXT:    ldrh w11, [x1, #2]
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    movi v1.2s, #127, msl #8
-; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mvni v1.2s, #127, msl #8
-; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [x2, #2]
@@ -345,15 +214,7 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    movi v3.16b, #127
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
   ret <12 x i8> %z
@@ -364,24 +225,8 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ldp q3, q2, [x1]
-; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    mvni v4.8h, #128, lsl #8
-; CHECK-NEXT:    add v6.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmlt v7.8h, v6.8h, #0
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v5.16b, v7.16b, v16.16b
-; CHECK-NEXT:    add v7.8h, v0.8h, v3.8h
-; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v6.8h
-; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v7.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bsl v4.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v6.16b
-; CHECK-NEXT:    bsl v0.16b, v4.16b, v7.16b
+; CHECK-NEXT:    sqadd v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    sqadd v0.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    str q0, [x2]
 ; CHECK-NEXT:    str d1, [x2, #16]
 ; CHECK-NEXT:    ret
@@ -397,15 +242,7 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ldr b1, [x1]
-; CHECK-NEXT:    movi v2.8b, #127
-; CHECK-NEXT:    add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmlt v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, <1 x i8>* %px
@@ -420,15 +257,7 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ldr h1, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, <1 x i16>* %px
@@ -444,11 +273,11 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #4
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #4
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
-; CHECK-NEXT:    movi v2.16b, #7
-; CHECK-NEXT:    ssra v0.16b, v1.16b, #4
-; CHECK-NEXT:    smin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v1.16b, #248
-; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sshr v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v0.16b, v0.16b, #4
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
@@ -460,11 +289,11 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEXT:    ssra v0.16b, v1.16b, #7
-; CHECK-NEXT:    smin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sshr v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    ret
   %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
@@ -473,15 +302,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT:    mvni v3.2s, #128, lsl #24
-; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
-; CHECK-NEXT:    cmgt v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v3.8b, v2.8b
+; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -490,15 +311,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
-; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -507,24 +320,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v4.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmlt v7.4s, v4.4s, #0
-; CHECK-NEXT:    mvni v6.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    add v7.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmlt v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmlt v16.4s, v7.4s, #0
-; CHECK-NEXT:    mvni v5.4s, #128, lsl #24
-; CHECK-NEXT:    cmlt v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    sqadd v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -533,42 +330,10 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; CHECK-LABEL: v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v16.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmlt v24.4s, v16.4s, #0
-; CHECK-NEXT:    mvni v18.4s, #128, lsl #24
-; CHECK-NEXT:    add v19.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v19.4s, #0
-; CHECK-NEXT:    mvni v20.4s, #128, lsl #24
-; CHECK-NEXT:    add v21.4s, v2.4s, v6.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v21.4s, #0
-; CHECK-NEXT:    cmlt v4.4s, v4.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mvni v22.4s, #128, lsl #24
-; CHECK-NEXT:    add v23.4s, v3.4s, v7.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmlt v4.4s, v5.4s, #0
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, v19.4s
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v23.4s, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.4s, v6.4s, #0
-; CHECK-NEXT:    cmgt v2.4s, v2.4s, v21.4s
-; CHECK-NEXT:    mvni v17.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmlt v4.4s, v7.4s, #0
-; CHECK-NEXT:    cmgt v3.4s, v3.4s, v23.4s
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqadd v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    sqadd v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sqadd v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sqadd v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    ret
   %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -577,16 +342,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.2d, v0.2d, v1.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
+; CHECK-NEXT:    sqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -595,25 +351,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v4.2d, v0.2d, v2.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v5.2d, v4.2d, #0
-; CHECK-NEXT:    dup v6.2d, x8
-; CHECK-NEXT:    mvn v7.16b, v5.16b
-; CHECK-NEXT:    mov v16.16b, v6.16b
-; CHECK-NEXT:    bsl v16.16b, v5.16b, v7.16b
-; CHECK-NEXT:    add v5.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmlt v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    cmlt v7.2d, v5.2d, #0
-; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmgt v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v16.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v6.16b, v5.16b
+; CHECK-NEXT:    sqadd v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    sqadd v1.2d, v1.2d, v3.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -622,43 +361,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; CHECK-LABEL: v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v16.2d, v0.2d, v4.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    add v17.2d, v1.2d, v5.2d
-; CHECK-NEXT:    cmlt v20.2d, v16.2d, #0
-; CHECK-NEXT:    dup v21.2d, x8
-; CHECK-NEXT:    add v18.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmlt v22.2d, v17.2d, #0
-; CHECK-NEXT:    mvn v24.16b, v20.16b
-; CHECK-NEXT:    mov v25.16b, v21.16b
-; CHECK-NEXT:    cmlt v23.2d, v18.2d, #0
-; CHECK-NEXT:    bsl v25.16b, v20.16b, v24.16b
-; CHECK-NEXT:    mvn v20.16b, v22.16b
-; CHECK-NEXT:    mov v24.16b, v21.16b
-; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v16.2d
-; CHECK-NEXT:    add v19.2d, v3.2d, v7.2d
-; CHECK-NEXT:    bsl v24.16b, v22.16b, v20.16b
-; CHECK-NEXT:    mvn v20.16b, v23.16b
-; CHECK-NEXT:    mov v22.16b, v21.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmlt v4.2d, v5.2d, #0
-; CHECK-NEXT:    cmgt v1.2d, v1.2d, v17.2d
-; CHECK-NEXT:    bsl v22.16b, v23.16b, v20.16b
-; CHECK-NEXT:    cmlt v20.2d, v19.2d, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.2d, v6.2d, #0
-; CHECK-NEXT:    cmgt v2.2d, v2.2d, v18.2d
-; CHECK-NEXT:    mvn v23.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmlt v4.2d, v7.2d, #0
-; CHECK-NEXT:    cmgt v3.2d, v3.2d, v19.2d
-; CHECK-NEXT:    bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v25.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v24.16b, v17.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v18.16b
-; CHECK-NEXT:    bsl v3.16b, v21.16b, v19.16b
+; CHECK-NEXT:    sqadd v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    sqadd v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    sqadd v2.2d, v2.2d, v6.2d
+; CHECK-NEXT:    sqadd v3.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    ret
   %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index bc5bd09e9510..4fab863460f3 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -88,15 +88,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
   ret <4 x i32> %tmp;

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index d83eb4a3bf09..2cf6e896bed0 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -36,15 +36,7 @@ declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>)
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    movi v3.16b, #127
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
@@ -53,24 +45,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v4.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmlt v7.16b, v4.16b, #0
-; CHECK-NEXT:    movi v6.16b, #127
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    sub v7.16b, v1.16b, v3.16b
-; CHECK-NEXT:    cmgt v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmlt v16.16b, v7.16b, #0
-; CHECK-NEXT:    movi v5.16b, #127
-; CHECK-NEXT:    cmgt v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, v7.16b
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    sqsub v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -79,42 +55,10 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK-LABEL: v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v16.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmlt v24.16b, v16.16b, #0
-; CHECK-NEXT:    movi v18.16b, #127
-; CHECK-NEXT:    sub v19.16b, v1.16b, v5.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v19.16b, #0
-; CHECK-NEXT:    movi v20.16b, #127
-; CHECK-NEXT:    sub v21.16b, v2.16b, v6.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v21.16b, #0
-; CHECK-NEXT:    cmgt v4.16b, v4.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    movi v22.16b, #127
-; CHECK-NEXT:    sub v23.16b, v3.16b, v7.16b
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmgt v4.16b, v5.16b, #0
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, v19.16b
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.16b, v23.16b, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmgt v4.16b, v6.16b, #0
-; CHECK-NEXT:    cmgt v2.16b, v2.16b, v21.16b
-; CHECK-NEXT:    movi v17.16b, #127
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmgt v4.16b, v7.16b, #0
-; CHECK-NEXT:    cmgt v3.16b, v3.16b, v23.16b
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    sqsub v1.16b, v1.16b, v5.16b
+; CHECK-NEXT:    sqsub v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    sqsub v3.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    ret
   %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
@@ -123,15 +67,7 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    mvni v3.8h, #128, lsl #8
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %z
@@ -140,24 +76,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v4.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
-; CHECK-NEXT:    mvni v6.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    sub v7.8h, v1.8h, v3.8h
-; CHECK-NEXT:    cmgt v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    cmgt v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v7.8h
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqsub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    sqsub v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -166,42 +86,10 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; CHECK-LABEL: v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v16.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmlt v24.8h, v16.8h, #0
-; CHECK-NEXT:    mvni v18.8h, #128, lsl #8
-; CHECK-NEXT:    sub v19.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v19.8h, #0
-; CHECK-NEXT:    mvni v20.8h, #128, lsl #8
-; CHECK-NEXT:    sub v21.8h, v2.8h, v6.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v21.8h, #0
-; CHECK-NEXT:    cmgt v4.8h, v4.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mvni v22.8h, #128, lsl #8
-; CHECK-NEXT:    sub v23.8h, v3.8h, v7.8h
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmgt v4.8h, v5.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v19.8h
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.8h, v23.8h, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmgt v4.8h, v6.8h, #0
-; CHECK-NEXT:    cmgt v2.8h, v2.8h, v21.8h
-; CHECK-NEXT:    mvni v17.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmgt v4.8h, v7.8h, #0
-; CHECK-NEXT:    cmgt v3.8h, v3.8h, v23.8h
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqsub v0.8h, v0.8h, v4.8h
+; CHECK-NEXT:    sqsub v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    sqsub v2.8h, v2.8h, v6.8h
+; CHECK-NEXT:    sqsub v3.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    ret
   %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
@@ -212,15 +100,7 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    movi v2.8b, #127
-; CHECK-NEXT:    sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmgt v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, <8 x i8>* %px
@@ -249,11 +129,10 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    mov v1.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w10
 ; CHECK-NEXT:    mov v1.h[3], w11
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    movi v1.4h, #127
-; CHECK-NEXT:    smin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    mvni v1.4h, #127
-; CHECK-NEXT:    smax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -267,19 +146,18 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsb w8, [x0]
-; CHECK-NEXT:    ldrsb w9, [x1]
-; CHECK-NEXT:    ldrsb w10, [x0, #1]
-; CHECK-NEXT:    ldrsb w11, [x1, #1]
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    ldrb w10, [x0, #1]
+; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    movi v1.2s, #127
-; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mvni v1.2s, #127
-; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w8, [x2, #1]
@@ -297,15 +175,7 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmgt v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, <4 x i16>* %px
@@ -318,19 +188,18 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh w8, [x0]
-; CHECK-NEXT:    ldrsh w9, [x1]
-; CHECK-NEXT:    ldrsh w10, [x0, #2]
-; CHECK-NEXT:    ldrsh w11, [x1, #2]
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x1]
+; CHECK-NEXT:    ldrh w10, [x0, #2]
+; CHECK-NEXT:    ldrh w11, [x1, #2]
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    movi v1.2s, #127, msl #8
-; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mvni v1.2s, #127, msl #8
-; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [x2, #2]
@@ -346,15 +215,7 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    movi v3.16b, #127
-; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
   ret <12 x i8> %z
@@ -365,24 +226,8 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ldp q3, q2, [x1]
-; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    mvni v4.8h, #128, lsl #8
-; CHECK-NEXT:    sub v6.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmlt v7.8h, v6.8h, #0
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v5.16b, v7.16b, v16.16b
-; CHECK-NEXT:    sub v7.8h, v0.8h, v3.8h
-; CHECK-NEXT:    cmgt v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmgt v1.8h, v1.8h, v6.8h
-; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmgt v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmgt v0.8h, v0.8h, v7.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bsl v4.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v6.16b
-; CHECK-NEXT:    bsl v0.16b, v4.16b, v7.16b
+; CHECK-NEXT:    sqsub v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    sqsub v0.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    str q0, [x2]
 ; CHECK-NEXT:    str d1, [x2, #16]
 ; CHECK-NEXT:    ret
@@ -398,15 +243,7 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ldr b1, [x1]
-; CHECK-NEXT:    movi v2.8b, #127
-; CHECK-NEXT:    sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmgt v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, <1 x i8>* %px
@@ -421,15 +258,7 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ldr h1, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmgt v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, <1 x i16>* %px
@@ -442,15 +271,14 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-LABEL: v16i4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v1.16b, v1.16b, #4
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #4
+; CHECK-NEXT:    shl v1.16b, v1.16b, #4
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    sshr v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v0.16b, v0.16b, #4
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
-; CHECK-NEXT:    movi v2.16b, #7
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    smin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v1.16b, #248
-; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
@@ -459,15 +287,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ; CHECK-LABEL: v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    sshr v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    smin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
@@ -476,15 +303,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT:    mvni v3.2s, #128, lsl #24
-; CHECK-NEXT:    cmgt v1.2s, v1.2s, #0
-; CHECK-NEXT:    cmgt v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    bsl v0.8b, v3.8b, v2.8b
+; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -493,15 +312,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -510,24 +321,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v4.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmlt v7.4s, v4.4s, #0
-; CHECK-NEXT:    mvni v6.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    sub v7.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmgt v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmlt v16.4s, v7.4s, #0
-; CHECK-NEXT:    mvni v5.4s, #128, lsl #24
-; CHECK-NEXT:    cmgt v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    sqsub v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -536,42 +331,10 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; CHECK-LABEL: v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v16.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmlt v24.4s, v16.4s, #0
-; CHECK-NEXT:    mvni v18.4s, #128, lsl #24
-; CHECK-NEXT:    sub v19.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v18.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v19.4s, #0
-; CHECK-NEXT:    mvni v20.4s, #128, lsl #24
-; CHECK-NEXT:    sub v21.4s, v2.4s, v6.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v21.4s, #0
-; CHECK-NEXT:    cmgt v4.4s, v4.4s, #0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mvni v22.4s, #128, lsl #24
-; CHECK-NEXT:    sub v23.4s, v3.4s, v7.4s
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmgt v4.4s, v5.4s, #0
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, v19.4s
-; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmlt v24.4s, v23.4s, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmgt v4.4s, v6.4s, #0
-; CHECK-NEXT:    cmgt v2.4s, v2.4s, v21.4s
-; CHECK-NEXT:    mvni v17.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v25.16b, v24.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmgt v4.4s, v7.4s, #0
-; CHECK-NEXT:    cmgt v3.4s, v3.4s, v23.4s
-; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
-; CHECK-NEXT:    bsl v3.16b, v17.16b, v23.16b
+; CHECK-NEXT:    sqsub v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    sqsub v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sqsub v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sqsub v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    ret
   %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -580,16 +343,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.2d, v0.2d, v1.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT:    cmgt v1.2d, v1.2d, #0
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
+; CHECK-NEXT:    sqsub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -598,25 +352,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v4.2d, v0.2d, v2.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v5.2d, v4.2d, #0
-; CHECK-NEXT:    dup v6.2d, x8
-; CHECK-NEXT:    mvn v7.16b, v5.16b
-; CHECK-NEXT:    mov v16.16b, v6.16b
-; CHECK-NEXT:    bsl v16.16b, v5.16b, v7.16b
-; CHECK-NEXT:    sub v5.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmgt v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    cmlt v7.2d, v5.2d, #0
-; CHECK-NEXT:    cmgt v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmgt v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    mvn v2.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v6.16b, v7.16b, v2.16b
-; CHECK-NEXT:    bsl v0.16b, v16.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v6.16b, v5.16b
+; CHECK-NEXT:    sqsub v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    sqsub v1.2d, v1.2d, v3.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -625,43 +362,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; CHECK-LABEL: v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v16.2d, v0.2d, v4.2d
-; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    sub v17.2d, v1.2d, v5.2d
-; CHECK-NEXT:    cmlt v20.2d, v16.2d, #0
-; CHECK-NEXT:    dup v21.2d, x8
-; CHECK-NEXT:    sub v18.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmlt v22.2d, v17.2d, #0
-; CHECK-NEXT:    mvn v24.16b, v20.16b
-; CHECK-NEXT:    mov v25.16b, v21.16b
-; CHECK-NEXT:    cmlt v23.2d, v18.2d, #0
-; CHECK-NEXT:    bsl v25.16b, v20.16b, v24.16b
-; CHECK-NEXT:    mvn v20.16b, v22.16b
-; CHECK-NEXT:    mov v24.16b, v21.16b
-; CHECK-NEXT:    cmgt v4.2d, v4.2d, #0
-; CHECK-NEXT:    cmgt v0.2d, v0.2d, v16.2d
-; CHECK-NEXT:    sub v19.2d, v3.2d, v7.2d
-; CHECK-NEXT:    bsl v24.16b, v22.16b, v20.16b
-; CHECK-NEXT:    mvn v20.16b, v23.16b
-; CHECK-NEXT:    mov v22.16b, v21.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    cmgt v4.2d, v5.2d, #0
-; CHECK-NEXT:    cmgt v1.2d, v1.2d, v17.2d
-; CHECK-NEXT:    bsl v22.16b, v23.16b, v20.16b
-; CHECK-NEXT:    cmlt v20.2d, v19.2d, #0
-; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    cmgt v4.2d, v6.2d, #0
-; CHECK-NEXT:    cmgt v2.2d, v2.2d, v18.2d
-; CHECK-NEXT:    mvn v23.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    cmgt v4.2d, v7.2d, #0
-; CHECK-NEXT:    cmgt v3.2d, v3.2d, v19.2d
-; CHECK-NEXT:    bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v0.16b, v25.16b, v16.16b
-; CHECK-NEXT:    bsl v1.16b, v24.16b, v17.16b
-; CHECK-NEXT:    bsl v2.16b, v22.16b, v18.16b
-; CHECK-NEXT:    bsl v3.16b, v21.16b, v19.16b
+; CHECK-NEXT:    sqsub v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    sqsub v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    sqsub v2.2d, v2.2d, v6.2d
+; CHECK-NEXT:    sqsub v3.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    ret
   %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z

diff  --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 4ff14c72e1f8..40bbac2c0557 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -35,9 +35,7 @@ declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>)
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v2.16b, v1.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
@@ -46,12 +44,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v4.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    umin v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    uqadd v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -60,18 +54,10 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK-LABEL: v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v16.16b, v4.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    mvn v16.16b, v5.16b
-; CHECK-NEXT:    umin v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    umin v2.16b, v2.16b, v16.16b
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    umin v3.16b, v3.16b, v16.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    add v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    add v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    add v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    uqadd v1.16b, v1.16b, v5.16b
+; CHECK-NEXT:    uqadd v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    uqadd v3.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    ret
   %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
@@ -80,9 +66,7 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v2.16b, v1.16b
-; CHECK-NEXT:    umin v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %z
@@ -91,12 +75,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v4.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    umin v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    umin v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    add v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uqadd v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -105,18 +85,10 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; CHECK-LABEL: v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v16.16b, v4.16b
-; CHECK-NEXT:    umin v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mvn v16.16b, v5.16b
-; CHECK-NEXT:    umin v1.8h, v1.8h, v16.8h
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    umin v2.8h, v2.8h, v16.8h
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    umin v3.8h, v3.8h, v16.8h
-; CHECK-NEXT:    add v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    add v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    add v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    add v3.8h, v3.8h, v7.8h
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v4.8h
+; CHECK-NEXT:    uqadd v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    uqadd v2.8h, v2.8h, v6.8h
+; CHECK-NEXT:    uqadd v3.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    ret
   %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
@@ -125,11 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mvn v2.8b, v0.8b
-; CHECK-NEXT:    umin v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    add v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, <8 x i8>* %px
@@ -146,21 +116,22 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    ldrb w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
-; CHECK-NEXT:    ldrb w12, [x0, #2]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrb w8, [x1, #2]
 ; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    ldrb w8, [x0, #2]
+; CHECK-NEXT:    ldrb w9, [x1, #2]
 ; CHECK-NEXT:    mov v0.h[1], w10
-; CHECK-NEXT:    ldrb w9, [x0, #3]
-; CHECK-NEXT:    ldrb w10, [x1, #3]
 ; CHECK-NEXT:    mov v1.h[1], w11
-; CHECK-NEXT:    mov v0.h[2], w12
-; CHECK-NEXT:    mov v1.h[2], w8
-; CHECK-NEXT:    mov v0.h[3], w9
-; CHECK-NEXT:    mov v1.h[3], w10
-; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umin v0.4h, v0.4h, v2.4h
+; CHECK-NEXT:    ldrb w10, [x0, #3]
+; CHECK-NEXT:    ldrb w11, [x1, #3]
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    mov v1.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w10
+; CHECK-NEXT:    mov v1.h[3], w11
+; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ushr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -179,12 +150,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    mov v1.s[1], w11
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w8, [x2, #1]
@@ -200,11 +172,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mvn v2.8b, v0.8b
-; CHECK-NEXT:    umin v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, <4 x i16>* %px
@@ -222,12 +192,13 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-NEXT:    ldrh w10, [x0, #2]
 ; CHECK-NEXT:    ldrh w11, [x1, #2]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    mov v1.s[1], w11
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [x2, #2]
@@ -243,9 +214,7 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v2.16b, v1.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
   ret <12 x i8> %z
@@ -254,16 +223,12 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v12i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x1]
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    mvn v4.16b, v0.16b
-; CHECK-NEXT:    mvn v5.16b, v1.16b
-; CHECK-NEXT:    umin v2.8h, v2.8h, v4.8h
-; CHECK-NEXT:    umin v3.8h, v3.8h, v5.8h
-; CHECK-NEXT:    add v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    add v1.8h, v3.8h, v1.8h
-; CHECK-NEXT:    str q1, [x2]
-; CHECK-NEXT:    str d0, [x2, #16]
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x1]
+; CHECK-NEXT:    uqadd v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v3.8h
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    str d1, [x2, #16]
 ; CHECK-NEXT:    ret
   %x = load <12 x i16>, <12 x i16>* %px
   %y = load <12 x i16>, <12 x i16>* %py
@@ -275,11 +240,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
 define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr b0, [x1]
-; CHECK-NEXT:    ldr b1, [x0]
-; CHECK-NEXT:    mvn v2.8b, v0.8b
-; CHECK-NEXT:    umin v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    add v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ldr b1, [x1]
+; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, <1 x i8>* %px
@@ -292,11 +255,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x1]
-; CHECK-NEXT:    ldr h1, [x0]
-; CHECK-NEXT:    mvn v2.8b, v0.8b
-; CHECK-NEXT:    umin v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ldr h1, [x1]
+; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, <1 x i16>* %px
@@ -310,10 +271,12 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-LABEL: v16i4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #15
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    shl v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v0.16b, v0.16b, #4
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
@@ -323,10 +286,12 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ; CHECK-LABEL: v16i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #1
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    ret
   %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
@@ -335,9 +300,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v2.8b, v1.8b
-; CHECK-NEXT:    umin v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -346,9 +309,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v2.16b, v1.16b
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -357,12 +318,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v4.16b, v2.16b
-; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    umin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    umin v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    uqadd v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -371,18 +328,10 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; CHECK-LABEL: v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v16.16b, v4.16b
-; CHECK-NEXT:    umin v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mvn v16.16b, v5.16b
-; CHECK-NEXT:    umin v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    umin v2.4s, v2.4s, v16.4s
-; CHECK-NEXT:    mvn v16.16b, v7.16b
-; CHECK-NEXT:    umin v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    add v3.4s, v3.4s, v7.4s
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    uqadd v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    uqadd v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    uqadd v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    ret
   %z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -391,9 +340,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -402,12 +349,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    cmhi v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    uqadd v1.2d, v1.2d, v3.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -416,18 +359,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; CHECK-LABEL: v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v4.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v5.2d, v1.2d, v5.2d
-; CHECK-NEXT:    add v6.2d, v2.2d, v6.2d
-; CHECK-NEXT:    add v7.2d, v3.2d, v7.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    cmhi v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    cmhi v2.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmhi v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    orr v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    orr v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    orr v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    orr v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    uqadd v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    uqadd v2.2d, v2.2d, v6.2d
+; CHECK-NEXT:    uqadd v3.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    ret
   %z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z

diff  --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index db982cc42d0c..3eacf03dc6a8 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -36,8 +36,7 @@ declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>)
 define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %z
@@ -46,10 +45,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    umax v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    sub v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    uqsub v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -58,14 +55,10 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK-LABEL: v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    umax v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    umax v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    umax v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    sub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    sub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    sub v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    uqsub v1.16b, v1.16b, v5.16b
+; CHECK-NEXT:    uqsub v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    uqsub v3.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    ret
   %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
@@ -74,8 +67,7 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uqsub v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %z
@@ -84,10 +76,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    umax v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    sub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    sub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uqsub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uqsub v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -96,14 +86,10 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; CHECK-LABEL: v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    umax v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    umax v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    umax v3.8h, v3.8h, v7.8h
-; CHECK-NEXT:    sub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    sub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    sub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    sub v3.8h, v3.8h, v7.8h
+; CHECK-NEXT:    uqsub v0.8h, v0.8h, v4.8h
+; CHECK-NEXT:    uqsub v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    uqsub v2.8h, v2.8h, v6.8h
+; CHECK-NEXT:    uqsub v3.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    ret
   %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
@@ -114,8 +100,7 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    umax v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    uqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, <8 x i8>* %px
@@ -144,8 +129,10 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    mov v1.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w10
 ; CHECK-NEXT:    mov v1.h[3], w11
-; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ushr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -167,8 +154,10 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w8, [x2, #1]
@@ -186,8 +175,7 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, <4 x i16>* %px
@@ -208,8 +196,10 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [x2, #2]
@@ -225,8 +215,7 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.usub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
   ret <12 x i8> %z
@@ -237,10 +226,8 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ldp q3, q2, [x1]
-; CHECK-NEXT:    umax v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    umax v0.8h, v0.8h, v3.8h
-; CHECK-NEXT:    sub v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    sub v0.8h, v0.8h, v3.8h
+; CHECK-NEXT:    uqsub v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    uqsub v0.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    str q0, [x2]
 ; CHECK-NEXT:    str d1, [x2, #16]
 ; CHECK-NEXT:    ret
@@ -256,8 +243,7 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ldr b1, [x1]
-; CHECK-NEXT:    umax v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    uqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, <1 x i8>* %px
@@ -272,8 +258,7 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ldr h1, [x1]
-; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, <1 x i16>* %px
@@ -287,10 +272,12 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-LABEL: v16i4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #15
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    shl v1.16b, v1.16b, #4
+; CHECK-NEXT:    shl v0.16b, v0.16b, #4
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
@@ -300,10 +287,12 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ; CHECK-LABEL: v16i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #1
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    ret
   %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
@@ -312,8 +301,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -322,8 +310,7 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uqsub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -332,10 +319,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    umax v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uqsub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    uqsub v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -344,14 +329,10 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; CHECK-LABEL: v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    umax v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    umax v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    umax v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v7.4s
+; CHECK-NEXT:    uqsub v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    uqsub v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    uqsub v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    uqsub v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    ret
   %z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -360,9 +341,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqsub v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -371,12 +350,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.2d, v0.2d, v2.2d
-; CHECK-NEXT:    sub v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmhi v0.2d, v2.2d, v0.2d
-; CHECK-NEXT:    cmhi v1.2d, v3.2d, v1.2d
-; CHECK-NEXT:    bic v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    uqsub v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    uqsub v1.2d, v1.2d, v3.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -385,18 +360,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; CHECK-LABEL: v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v4.2d, v0.2d, v4.2d
-; CHECK-NEXT:    sub v5.2d, v1.2d, v5.2d
-; CHECK-NEXT:    sub v6.2d, v2.2d, v6.2d
-; CHECK-NEXT:    sub v7.2d, v3.2d, v7.2d
-; CHECK-NEXT:    cmhi v0.2d, v4.2d, v0.2d
-; CHECK-NEXT:    cmhi v1.2d, v5.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v6.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v7.2d, v3.2d
-; CHECK-NEXT:    bic v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    bic v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    bic v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    uqsub v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    uqsub v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    uqsub v2.2d, v2.2d, v6.2d
+; CHECK-NEXT:    uqsub v3.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    ret
   %z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z


        


More information about the llvm-commits mailing list