[llvm] [AArch64][GlobalISel] Basic add_sat and sub_sat vector handling. (PR #80650)

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 28 00:58:38 PST 2024


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/80650

>From c0b9bc6f1ceffc42ed079ebde43175eb569a0d11 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 28 Feb 2024 08:55:59 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Basic add_sat and sub_sat vector
 handling.

This fills in basic GlobalISel vector handling for the saturating add/sub
operations (sadd_sat/uadd_sat and ssub_sat/usub_sat). It covers only the basics:
marking the supported vector types as legal and clamping illegally sized vectors
to legal ones.
---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  13 +-
 .../GlobalISel/legalizer-info-validation.mir  |   1 +
 llvm/test/CodeGen/AArch64/sadd_sat.ll         |   2 -
 llvm/test/CodeGen/AArch64/sadd_sat_vec.ll     | 293 +++++++++++------
 llvm/test/CodeGen/AArch64/ssub_sat.ll         |   2 -
 llvm/test/CodeGen/AArch64/ssub_sat_vec.ll     | 293 +++++++++++------
 llvm/test/CodeGen/AArch64/uadd_sat_vec.ll     | 295 ++++++++++++------
 llvm/test/CodeGen/AArch64/usub_sat_vec.ll     | 291 +++++++++++------
 8 files changed, 813 insertions(+), 377 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a2e805e8cb56dc..1d9289d9f56242 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1110,9 +1110,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarize(1)
       .lower();
 
-  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
-      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
-
   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
       .lower();
@@ -1160,8 +1157,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .minScalarEltSameAsIf(always, 1, 0)
       .maxScalarEltSameAsIf(always, 1, 0);
 
-  // TODO: Vector types.
-  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
+  getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+      .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
+      .clampNumElements(0, v8s8, v16s8)
+      .clampNumElements(0, v4s16, v8s16)
+      .clampNumElements(0, v2s32, v4s32)
+      .clampMaxNumElements(0, s64, 2)
+      .moreElementsToNextPow2(0)
+      .lowerIf(isScalar(0));
 
   // TODO: Libcall support for s128.
   // TODO: s16 should be legal with full FP16 support.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index d87704cf45d5d5..e553759c358042 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -395,6 +395,7 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_SADDSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_USUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index 9e09b7f9a4bd6f..789fd7b20a7f99 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -2,8 +2,6 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for vec
-
 declare i4 @llvm.sadd.sat.i4(i4, i4)
 declare i8 @llvm.sadd.sat.i8(i8, i8)
 declare i16 @llvm.sadd.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 5f905d94e3573c..0f1f42cacff6f2 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -2,28 +2,11 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
+; CHECK-GI:       warning: Instruction selection used fallback path for v4i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 }
 
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    sqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT:    sqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    sqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
 }
 
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    sqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    sqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    sqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT:    sqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    sqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT:    sqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    sqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    sqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT:    sqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT:    ret
   %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
 }
@@ -98,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 }
 
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    sqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    sqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
 }
 
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    sqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    sqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    sqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT:    sqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT:    sqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    sqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT:    sqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    sqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT:    sqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
 }
@@ -196,23 +207,41 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 }
 
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
-; CHECK-NEXT:    ld1 { v1.h }[0], [x1]
-; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    add x9, x1, #2
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    ld1 { v1.h }[2], [x9]
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x2]
-; CHECK-NEXT:    strh w8, [x2, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT:    ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT:    add x8, x0, #2
+; CHECK-SD-NEXT:    add x9, x1, #2
+; CHECK-SD-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    sqadd v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x2]
+; CHECK-SD-NEXT:    strh w8, [x2, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h2, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    sqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    str h0, [x2]
+; CHECK-GI-NEXT:    str h1, [x2, #2]
+; CHECK-GI-NEXT:    ret
   %x = load <2 x i16>, ptr %px
   %y = load <2 x i16>, ptr %py
   %z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -230,15 +259,67 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 }
 
 define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    sqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    sqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    str q0, [x2]
-; CHECK-NEXT:    str d1, [x2, #16]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldp q0, q3, [x1]
+; CHECK-SD-NEXT:    ldp q1, q2, [x0]
+; CHECK-SD-NEXT:    sqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT:    sqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    str q0, [x2]
+; CHECK-SD-NEXT:    str d1, [x2, #16]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    ldr h4, [x1, #10]
+; CHECK-GI-NEXT:    ldr h5, [x0, #18]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #4]
+; CHECK-GI-NEXT:    ldr h6, [x1, #16]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #4]
+; CHECK-GI-NEXT:    ldr h7, [x1, #18]
+; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #20]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #6]
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #6]
+; CHECK-GI-NEXT:    mov v6.h[2], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #22]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #8]
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #8]
+; CHECK-GI-NEXT:    mov v6.h[3], v7.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #10]
+; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x0, #16]
+; CHECK-GI-NEXT:    mov v3.h[1], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #20]
+; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #12]
+; CHECK-GI-NEXT:    mov v1.h[5], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #12]
+; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #22]
+; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #14]
+; CHECK-GI-NEXT:    mov v1.h[6], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #14]
+; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[7], v4.h[0]
+; CHECK-GI-NEXT:    sqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sqadd v1.4h, v3.4h, v6.4h
+; CHECK-GI-NEXT:    str q0, [x2]
+; CHECK-GI-NEXT:    str d1, [x2, #16]
+; CHECK-GI-NEXT:    ret
   %x = load <12 x i16>, ptr %px
   %y = load <12 x i16>, ptr %py
   %z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -346,23 +427,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    sqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    sqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
 }
 
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    sqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT:    sqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT:    sqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    sqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    sqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    sqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    sqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
 }
@@ -377,23 +472,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 }
 
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    sqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    sqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
 }
 
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT:    sqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    sqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    sqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT:    sqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    sqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT:    sqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT:    sqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT:    sqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT:    sqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index abeb4b357fa9fb..4d755f480c3fc9 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -2,8 +2,6 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for vec
-
 declare i4 @llvm.ssub.sat.i4(i4, i4)
 declare i8 @llvm.ssub.sat.i8(i8, i8)
 declare i16 @llvm.ssub.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index acec3e74d3e93f..a768bbbdc6343f 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -2,28 +2,11 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
+; CHECK-GI:       warning: Instruction selection used fallback path for v4i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +51,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 }
 
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    sqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT:    sqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    sqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
 }
 
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    sqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    sqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    sqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT:    sqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    sqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT:    sqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    sqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    sqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT:    sqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT:    ret
   %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
 }
@@ -99,23 +96,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 }
 
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    sqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    sqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
 }
 
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    sqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    sqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    sqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT:    sqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT:    sqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    sqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT:    sqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    sqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT:    sqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
 }
@@ -197,23 +208,41 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 }
 
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
-; CHECK-NEXT:    ld1 { v1.h }[0], [x1]
-; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    add x9, x1, #2
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    ld1 { v1.h }[2], [x9]
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x2]
-; CHECK-NEXT:    strh w8, [x2, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT:    ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT:    add x8, x0, #2
+; CHECK-SD-NEXT:    add x9, x1, #2
+; CHECK-SD-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    sqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x2]
+; CHECK-SD-NEXT:    strh w8, [x2, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h2, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    sqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    str h0, [x2]
+; CHECK-GI-NEXT:    str h1, [x2, #2]
+; CHECK-GI-NEXT:    ret
   %x = load <2 x i16>, ptr %px
   %y = load <2 x i16>, ptr %py
   %z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -231,15 +260,67 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 }
 
 define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    sqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    sqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    str q0, [x2]
-; CHECK-NEXT:    str d1, [x2, #16]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldp q0, q3, [x1]
+; CHECK-SD-NEXT:    ldp q1, q2, [x0]
+; CHECK-SD-NEXT:    sqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT:    sqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    str q0, [x2]
+; CHECK-SD-NEXT:    str d1, [x2, #16]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    ldr h4, [x1, #10]
+; CHECK-GI-NEXT:    ldr h5, [x0, #18]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #4]
+; CHECK-GI-NEXT:    ldr h6, [x1, #16]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #4]
+; CHECK-GI-NEXT:    ldr h7, [x1, #18]
+; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #20]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #6]
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #6]
+; CHECK-GI-NEXT:    mov v6.h[2], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #22]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #8]
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #8]
+; CHECK-GI-NEXT:    mov v6.h[3], v7.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #10]
+; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x0, #16]
+; CHECK-GI-NEXT:    mov v3.h[1], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #20]
+; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #12]
+; CHECK-GI-NEXT:    mov v1.h[5], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #12]
+; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #22]
+; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #14]
+; CHECK-GI-NEXT:    mov v1.h[6], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #14]
+; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[7], v4.h[0]
+; CHECK-GI-NEXT:    sqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sqsub v1.4h, v3.4h, v6.4h
+; CHECK-GI-NEXT:    str q0, [x2]
+; CHECK-GI-NEXT:    str d1, [x2, #16]
+; CHECK-GI-NEXT:    ret
   %x = load <12 x i16>, ptr %px
   %y = load <12 x i16>, ptr %py
   %z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -349,23 +430,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    sqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    sqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
 }
 
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    sqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT:    sqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT:    sqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    sqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    sqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    sqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    sqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
 }
@@ -380,23 +475,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 }
 
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    sqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    sqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
 }
 
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT:    sqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    sqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    sqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT:    sqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    sqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT:    sqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT:    sqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT:    sqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT:    sqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index e05c65daf50aae..e1536aff4915c6 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -2,28 +2,11 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
+; CHECK-GI:       warning: Instruction selection used fallback path for v4i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 }
 
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    uqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT:    uqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
 }
 
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    uqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    uqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    uqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT:    uqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT:    uqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    uqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    uqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT:    uqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT:    ret
   %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
 }
@@ -98,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 }
 
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    uqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    uqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
 }
 
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    uqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    uqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT:    uqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT:    uqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    uqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT:    uqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    uqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT:    uqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
 }
@@ -194,24 +205,42 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 }
 
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    ldrh w10, [x0, #2]
-; CHECK-NEXT:    ldrh w11, [x1, #2]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    umin v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x2]
-; CHECK-NEXT:    strh w8, [x2, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldrh w8, [x0]
+; CHECK-SD-NEXT:    ldrh w9, [x1]
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    ldrh w10, [x0, #2]
+; CHECK-SD-NEXT:    ldrh w11, [x1, #2]
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    fmov s1, w9
+; CHECK-SD-NEXT:    mov v0.s[1], w10
+; CHECK-SD-NEXT:    mov v1.s[1], w11
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    umin v0.2s, v0.2s, v2.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x2]
+; CHECK-SD-NEXT:    strh w8, [x2, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h2, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    uqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    str h0, [x2]
+; CHECK-GI-NEXT:    str h1, [x2, #2]
+; CHECK-GI-NEXT:    ret
   %x = load <2 x i16>, ptr %px
   %y = load <2 x i16>, ptr %py
   %z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -229,15 +258,67 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 }
 
 define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    uqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    uqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    str q0, [x2]
-; CHECK-NEXT:    str d1, [x2, #16]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldp q0, q3, [x1]
+; CHECK-SD-NEXT:    ldp q1, q2, [x0]
+; CHECK-SD-NEXT:    uqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT:    uqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    str q0, [x2]
+; CHECK-SD-NEXT:    str d1, [x2, #16]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    ldr h4, [x1, #10]
+; CHECK-GI-NEXT:    ldr h5, [x0, #18]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #4]
+; CHECK-GI-NEXT:    ldr h6, [x1, #16]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #4]
+; CHECK-GI-NEXT:    ldr h7, [x1, #18]
+; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #20]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #6]
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #6]
+; CHECK-GI-NEXT:    mov v6.h[2], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #22]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #8]
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #8]
+; CHECK-GI-NEXT:    mov v6.h[3], v7.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #10]
+; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x0, #16]
+; CHECK-GI-NEXT:    mov v3.h[1], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #20]
+; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #12]
+; CHECK-GI-NEXT:    mov v1.h[5], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #12]
+; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #22]
+; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #14]
+; CHECK-GI-NEXT:    mov v1.h[6], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #14]
+; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[7], v4.h[0]
+; CHECK-GI-NEXT:    uqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uqadd v1.4h, v3.4h, v6.4h
+; CHECK-GI-NEXT:    str q0, [x2]
+; CHECK-GI-NEXT:    str d1, [x2, #16]
+; CHECK-GI-NEXT:    ret
   %x = load <12 x i16>, ptr %px
   %y = load <12 x i16>, ptr %py
   %z = call <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -336,23 +417,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    uqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    uqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
 }
 
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    uqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    uqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    uqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT:    uqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT:    uqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    uqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    uqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    uqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    uqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
 }
@@ -367,23 +462,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 }
 
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    uqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    uqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    uqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
   %z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
 }
 
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT:    uqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    uqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    uqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT:    uqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    uqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT:    uqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT:    uqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT:    uqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT:    uqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 05f43e7d8427b7..7915182a8bba95 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -2,28 +2,11 @@
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
+; CHECK-GI:       warning: Instruction selection used fallback path for v4i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v12i16
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +51,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 }
 
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    uqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT:    uqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
 }
 
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    uqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    uqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    uqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT:    uqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT:    uqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    uqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    uqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT:    uqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT:    ret
   %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
   ret <64 x i8> %z
 }
@@ -99,23 +96,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 }
 
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    uqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    uqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
 }
 
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT:    uqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    uqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT:    uqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT:    uqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    uqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT:    uqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    uqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT:    uqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT:    ret
   %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
   ret <32 x i16> %z
 }
@@ -193,22 +204,40 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 }
 
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #2]
-; CHECK-NEXT:    ldrh w11, [x1, #2]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x2]
-; CHECK-NEXT:    strh w8, [x2, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldrh w8, [x0]
+; CHECK-SD-NEXT:    ldrh w9, [x1]
+; CHECK-SD-NEXT:    ldrh w10, [x0, #2]
+; CHECK-SD-NEXT:    ldrh w11, [x1, #2]
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    fmov s1, w9
+; CHECK-SD-NEXT:    mov v0.s[1], w10
+; CHECK-SD-NEXT:    mov v1.s[1], w11
+; CHECK-SD-NEXT:    uqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x2]
+; CHECK-SD-NEXT:    strh w8, [x2, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h2, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    uqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    str h0, [x2]
+; CHECK-GI-NEXT:    str h1, [x2, #2]
+; CHECK-GI-NEXT:    ret
   %x = load <2 x i16>, ptr %px
   %y = load <2 x i16>, ptr %py
   %z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -226,15 +255,67 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
 }
 
 define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    uqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    uqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    str q0, [x2]
-; CHECK-NEXT:    str d1, [x2, #16]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldp q0, q3, [x1]
+; CHECK-SD-NEXT:    ldp q1, q2, [x0]
+; CHECK-SD-NEXT:    uqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT:    uqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    str q0, [x2]
+; CHECK-SD-NEXT:    str d1, [x2, #16]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    ldr h4, [x1, #10]
+; CHECK-GI-NEXT:    ldr h5, [x0, #18]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #4]
+; CHECK-GI-NEXT:    ldr h6, [x1, #16]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #4]
+; CHECK-GI-NEXT:    ldr h7, [x1, #18]
+; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #20]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #6]
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #6]
+; CHECK-GI-NEXT:    mov v6.h[2], v7.h[0]
+; CHECK-GI-NEXT:    ldr h7, [x1, #22]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #8]
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x1, #8]
+; CHECK-GI-NEXT:    mov v6.h[3], v7.h[0]
+; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #10]
+; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    ldr h3, [x0, #16]
+; CHECK-GI-NEXT:    mov v3.h[1], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #20]
+; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #12]
+; CHECK-GI-NEXT:    mov v1.h[5], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #12]
+; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT:    ldr h5, [x0, #22]
+; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #14]
+; CHECK-GI-NEXT:    mov v1.h[6], v4.h[0]
+; CHECK-GI-NEXT:    ldr h4, [x1, #14]
+; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[7], v4.h[0]
+; CHECK-GI-NEXT:    uqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uqsub v1.4h, v3.4h, v6.4h
+; CHECK-GI-NEXT:    str q0, [x2]
+; CHECK-GI-NEXT:    str d1, [x2, #16]
+; CHECK-GI-NEXT:    ret
   %x = load <12 x i16>, ptr %px
   %y = load <12 x i16>, ptr %py
   %z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -334,23 +415,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    uqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    uqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
 }
 
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    uqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    uqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    uqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT:    uqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT:    uqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    uqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    uqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    uqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    uqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    ret
   %z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
 }
@@ -365,23 +460,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 }
 
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    uqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    uqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    uqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
   %z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
 }
 
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT:    uqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    uqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    uqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT:    uqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    uqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT:    uqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT:    uqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT:    uqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT:    uqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT:    ret
   %z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z
 }

>From 35bd97637f8f3f82acc9a1c4bc56499f45de6fb5 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 28 Feb 2024 08:58:02 +0000
Subject: [PATCH 2/2] Use lower instead of lowerIf(scalar)

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 1d9289d9f56242..56e51329274018 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1164,7 +1164,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s32, v4s32)
       .clampMaxNumElements(0, s64, 2)
       .moreElementsToNextPow2(0)
-      .lowerIf(isScalar(0));
+      .lower();
 
   // TODO: Libcall support for s128.
   // TODO: s16 should be legal with full FP16 support.



More information about the llvm-commits mailing list