[llvm] [AArch64][GlobalISel] Added support for hadd family of intrinsics (PR #163985)

Joshua Rodriguez via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 5 08:28:31 PST 2025


https://github.com/JoshdRod updated https://github.com/llvm/llvm-project/pull/163985

>From d4fd27582fa9d428911d56deff26131917b33661 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Thu, 16 Oct 2025 15:47:02 +0000
Subject: [PATCH 01/12] [AArch64][GlobalISel] Added uhadd intrinsic support

GlobalISel now selects uhadd intrinsic, without falling back to SDAG.
Note that GlobalISel-generated code involving uhadd seems to be inefficient when compared to SDAG.
---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  8 ++++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  2 ++
 llvm/test/CodeGen/AArch64/freeze.ll           | 25 +++++++++++++------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 30b7b03f7a69a..a80390011f986 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -239,6 +239,12 @@ def G_USDOT : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_UHADD : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -286,6 +292,8 @@ def : GINodeEquiv<G_UDOT, AArch64udot>;
 def : GINodeEquiv<G_SDOT, AArch64sdot>;
 def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
+def : GINodeEquiv<G_UHADD, avgflooru>;
+
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
 def : GINodeEquiv<G_AARCH64_PREFETCH, AArch64Prefetch>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5f93847bc680e..44ed11c396dbd 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1817,6 +1817,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(TargetOpcode::G_ABDS);
   case Intrinsic::aarch64_neon_uabd:
     return LowerBinOp(TargetOpcode::G_ABDU);
+  case Intrinsic::aarch64_neon_uhadd:
+    return LowerBinOp(AArch64::G_UHADD);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fb909fec90434..f5e2ffd7361ce 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -3,7 +3,6 @@
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for freeze_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_uhadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_urhadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_shadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_srhadd
@@ -435,13 +434,23 @@ define <8 x i16> @freeze_abds(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @freeze_uhadd(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: freeze_uhadd:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #15
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_uhadd:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.8h, #15
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_uhadd:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v2.8h, #15
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    movi v2.8h, #31
+; CHECK-GI-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %m0 = and <8 x i16> %a0, splat (i16 15)
   %m1 = and <8 x i16> %a1, splat (i16 15)
   %avg = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %m0, <8 x i16> %m1)

>From 279919cf16fbcba23dd8e9f7849076cb0cc744ee Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Fri, 17 Oct 2025 13:13:22 +0000
Subject: [PATCH 02/12] [AArch64][GlobalISel] Added urhadd intrinsic support

GlobalISel now selects urhadd intrinsic, without falling back to SDAG.
Note that GlobalISel-generated code involving urhadd seems to be inefficient when compared to SDAG.
---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  7 ++++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  2 ++
 llvm/test/CodeGen/AArch64/freeze.ll           | 25 +++++++++++++------
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index a80390011f986..68f921e030429 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -245,6 +245,12 @@ def G_UHADD : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_URHADD : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -293,6 +299,7 @@ def : GINodeEquiv<G_SDOT, AArch64sdot>;
 def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
 def : GINodeEquiv<G_UHADD, avgflooru>;
+def : GINodeEquiv<G_URHADD, avgceilu>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 44ed11c396dbd..f579c6f5ba091 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1819,6 +1819,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(TargetOpcode::G_ABDU);
   case Intrinsic::aarch64_neon_uhadd:
     return LowerBinOp(AArch64::G_UHADD);
+  case Intrinsic::aarch64_neon_urhadd:
+    return LowerBinOp(AArch64::G_URHADD);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index f5e2ffd7361ce..e2ae046da1467 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -3,7 +3,6 @@
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for freeze_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_urhadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_shadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_srhadd
 
@@ -460,13 +459,23 @@ define <8 x i16> @freeze_uhadd(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <8 x i16> @freeze_urhadd(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: freeze_urhadd:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #15
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_urhadd:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.8h, #15
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_urhadd:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v2.8h, #15
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    movi v2.8h, #31
+; CHECK-GI-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %m0 = and <8 x i16> %a0, splat (i16 15)
   %m1 = and <8 x i16> %a1, splat (i16 15)
   %avg = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %m0, <8 x i16> %m1)

>From faf2d20728f9cf3fdb9ba16fdd8ca601511caa66 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Fri, 17 Oct 2025 15:06:58 +0000
Subject: [PATCH 03/12] [AArch64][GlobalISel] Added shadd intrinsic support
 GlobalISel now selects shadd intrinsic, without falling back to SDAG. Note
 that GlobalISel-generated code involving shadd seems to be inefficient when
 compared to SDAG.

---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  7 ++++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  2 ++
 llvm/test/CodeGen/AArch64/freeze.ll           | 22 +++++++++++++------
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 68f921e030429..2c2c403d96d72 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -251,6 +251,12 @@ def G_URHADD : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_SHADD : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -300,6 +306,7 @@ def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
 def : GINodeEquiv<G_UHADD, avgflooru>;
 def : GINodeEquiv<G_URHADD, avgceilu>;
+def : GINodeEquiv<G_SHADD, avgfloors>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f579c6f5ba091..14f592b895c9e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1821,6 +1821,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(AArch64::G_UHADD);
   case Intrinsic::aarch64_neon_urhadd:
     return LowerBinOp(AArch64::G_URHADD);
+  case Intrinsic::aarch64_neon_shadd:
+    return LowerBinOp(AArch64::G_SHADD);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index e2ae046da1467..dffd89143d16b 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -3,7 +3,6 @@
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for freeze_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_shadd
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_srhadd
 
 %struct.T = type { i32, i32 }
@@ -485,12 +484,21 @@ define <8 x i16> @freeze_urhadd(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: freeze_shadd:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshr v1.8h, v1.8h, #8
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_shadd:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #8
+; CHECK-SD-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_shadd:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #8
+; CHECK-GI-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    sshr v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = ashr <8 x i16> %a1, splat (i16 8)
   %avg = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)

>From 701522aa4ff39827737b89af902131cd591da5c0 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Fri, 17 Oct 2025 15:46:32 +0000
Subject: [PATCH 04/12] [AArch64][GlobalISel] Added srhadd intrinsic support
 GlobalISel now selects srhadd intrinsic, without falling back to SDAG. Note
 that GlobalISel-generated code involving srhadd seems to be inefficient when
 compared to SDAG.

---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  7 ++++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  2 ++
 llvm/test/CodeGen/AArch64/freeze.ll           | 22 +++++++++++++------
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 2c2c403d96d72..e44e31845380a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -257,6 +257,12 @@ def G_SHADD : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_SRHADD : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -307,6 +313,7 @@ def : GINodeEquiv<G_USDOT, AArch64usdot>;
 def : GINodeEquiv<G_UHADD, avgflooru>;
 def : GINodeEquiv<G_URHADD, avgceilu>;
 def : GINodeEquiv<G_SHADD, avgfloors>;
+def : GINodeEquiv<G_SRHADD, avgceils>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 14f592b895c9e..c8d31bbbc8b9a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1823,6 +1823,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(AArch64::G_URHADD);
   case Intrinsic::aarch64_neon_shadd:
     return LowerBinOp(AArch64::G_SHADD);
+  case Intrinsic::aarch64_neon_srhadd:
+    return LowerBinOp(AArch64::G_SRHADD);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index dffd89143d16b..136ac8b0a2aa1 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -3,7 +3,6 @@
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for freeze_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for freeze_srhadd
 
 %struct.T = type { i32, i32 }
 
@@ -509,12 +508,21 @@ define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) {
 }
 
 define <8 x i16> @freeze_srhadd(<8 x i8> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: freeze_srhadd:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshr v1.8h, v1.8h, #8
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_srhadd:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #8
+; CHECK-SD-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_srhadd:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #8
+; CHECK-GI-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    sshr v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = ashr <8 x i16> %a1, splat (i16 8)
   %avg = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)

>From 09b24f5afc26f983b1f8d15e7fb96ac9bfdba7b8 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Fri, 17 Oct 2025 15:48:31 +0000
Subject: [PATCH 05/12] [AArch64][GlobalISel] Modified llc test to check
 generation from both SDAG and GISel Note that GlobalISel-generated code
 involving the hadd family of intrinsics seems to be inefficient when compared
 to SDAG.

---
 .../AArch64/aarch64-known-bits-hadd.ll        | 171 +++++++++++++-----
 1 file changed, 123 insertions(+), 48 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-known-bits-hadd.ll b/llvm/test/CodeGen/AArch64/aarch64-known-bits-hadd.ll
index f900f0209a108..a6fbaf01c5476 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-known-bits-hadd.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-known-bits-hadd.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
@@ -7,11 +8,20 @@ declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @haddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: haddu_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: haddu_zext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: haddu_zext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = zext <8 x i8> %a0 to <8 x i16>
   %x1 = zext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -20,11 +30,20 @@ define <8 x i16> @haddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @rhaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: rhaddu_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhaddu_zext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhaddu_zext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = zext <8 x i8> %a0 to <8 x i16>
   %x1 = zext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -33,11 +52,20 @@ define <8 x i16> @rhaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @hadds_zext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: hadds_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadds_zext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadds_zext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = zext <8 x i8> %a0 to <8 x i16>
   %x1 = zext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -46,12 +74,21 @@ define <8 x i16> @hadds_zext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @shaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: shaddu_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shaddu_zext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shaddu_zext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = zext <8 x i8> %a0 to <8 x i16>
   %x1 = zext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -62,13 +99,22 @@ define <8 x i16> @shaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
 ; ; negative tests
 
 define <8 x i16> @haddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: haddu_sext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    bic v0.8h, #254, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: haddu_sext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    bic v0.8h, #254, lsl #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: haddu_sext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = sext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -77,13 +123,22 @@ define <8 x i16> @haddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @urhadd_sext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: urhadd_sext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    bic v0.8h, #254, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urhadd_sext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    bic v0.8h, #254, lsl #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urhadd_sext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = sext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -92,12 +147,21 @@ define <8 x i16> @urhadd_sext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @hadds_sext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: hadds_sext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    bic v0.8h, #254, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadds_sext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    bic v0.8h, #254, lsl #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadds_sext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = sext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
@@ -106,15 +170,26 @@ define <8 x i16> @hadds_sext(<8 x i8> %a0, <8 x i8> %a1) {
 }
 
 define <8 x i16> @shaddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
-; CHECK-LABEL: shaddu_sext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    bic v0.8h, #254, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shaddu_sext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    bic v0.8h, #254, lsl #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shaddu_sext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mvni v2.8h, #254, lsl #8
+; CHECK-GI-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %x0 = sext <8 x i8> %a0 to <8 x i16>
   %x1 = sext <8 x i8> %a1 to <8 x i16>
   %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
   %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
   ret <8 x i16> %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

>From f48cbfec7e7dd36f8248993e96849e7380dd4606 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Tue, 21 Oct 2025 14:36:01 +0000
Subject: [PATCH 06/12] [AArch64][GlobalISel] Modified gMIR instruction names
 to match SDAG equivalents.

---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td     | 16 ++++++++--------
 .../AArch64/GISel/AArch64LegalizerInfo.cpp       |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index e44e31845380a..d055e28f41e35 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -239,25 +239,25 @@ def G_USDOT : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
-def G_UHADD : AArch64GenericInstruction {
+def G_AVGFLOORU : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type1:$src2);
   let hasSideEffects = 0;
 }
 
-def G_URHADD : AArch64GenericInstruction {
+def G_AVGCEILU : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type1:$src2);
   let hasSideEffects = 0;
 }
 
-def G_SHADD : AArch64GenericInstruction {
+def G_AVGFLOORS : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type1:$src2);
   let hasSideEffects = 0;
 }
 
-def G_SRHADD : AArch64GenericInstruction {
+def G_AVGCEILS : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type1:$src2);
   let hasSideEffects = 0;
@@ -310,10 +310,10 @@ def : GINodeEquiv<G_UDOT, AArch64udot>;
 def : GINodeEquiv<G_SDOT, AArch64sdot>;
 def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
-def : GINodeEquiv<G_UHADD, avgflooru>;
-def : GINodeEquiv<G_URHADD, avgceilu>;
-def : GINodeEquiv<G_SHADD, avgfloors>;
-def : GINodeEquiv<G_SRHADD, avgceils>;
+def : GINodeEquiv<G_AVGFLOORU, avgflooru>;
+def : GINodeEquiv<G_AVGCEILU, avgceilu>;
+def : GINodeEquiv<G_AVGFLOORS, avgfloors>;
+def : GINodeEquiv<G_AVGCEILS, avgceils>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c8d31bbbc8b9a..204f4b2b4c2de 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1818,13 +1818,13 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::aarch64_neon_uabd:
     return LowerBinOp(TargetOpcode::G_ABDU);
   case Intrinsic::aarch64_neon_uhadd:
-    return LowerBinOp(AArch64::G_UHADD);
+    return LowerBinOp(AArch64::G_AVGFLOORU);
   case Intrinsic::aarch64_neon_urhadd:
-    return LowerBinOp(AArch64::G_URHADD);
+    return LowerBinOp(AArch64::G_AVGCEILU);
   case Intrinsic::aarch64_neon_shadd:
-    return LowerBinOp(AArch64::G_SHADD);
+    return LowerBinOp(AArch64::G_AVGFLOORS);
   case Intrinsic::aarch64_neon_srhadd:
-    return LowerBinOp(AArch64::G_SRHADD);
+    return LowerBinOp(AArch64::G_AVGCEILS);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});

>From cc40eb7f9968628348959186a5ff7780c4207c83 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Mon, 27 Oct 2025 13:55:11 +0000
Subject: [PATCH 07/12] [AArch64][GlobalISel] Converted intrinsics to
 machine-independent form

---
 llvm/include/llvm/Support/TargetOpcodes.def   | 11 ++++++++
 llvm/include/llvm/Target/GenericOpcodes.td    | 28 +++++++++++++++++++
 .../Target/GlobalISel/SelectionDAGCompat.td   |  4 +++
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  | 24 ----------------
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 10 +++----
 .../GlobalISel/legalizer-info-validation.mir  | 20 +++++++++++++
 .../match-table-cxx.td                        |  2 +-
 .../GlobalISelEmitter/GlobalISelEmitter.td    |  2 +-
 llvm/test/TableGen/get-named-operand-idx.td   |  3 +-
 9 files changed, 72 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index e55314568d683..d7a2e899ffd6f 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -295,6 +295,17 @@ HANDLE_TARGET_OPCODE(G_ABDS)
 /// Generic absolute difference unsigned instruction.
 HANDLE_TARGET_OPCODE(G_ABDU)
 
+/// Generic vector average with truncate unsigned instruction.
+HANDLE_TARGET_OPCODE(G_AVGFLOORU)
+
+/// Generic vector average with round unsigned instruction.
+HANDLE_TARGET_OPCODE(G_AVGCEILU)
+
+/// Generic vector average with truncate signed instruction.
+HANDLE_TARGET_OPCODE(G_AVGFLOORS)
+
+/// Generic vector average with round signed instruction.
+HANDLE_TARGET_OPCODE(G_AVGCEILS)
 
 HANDLE_TARGET_OPCODE(G_IMPLICIT_DEF)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index e3f995d53484f..b847e0425cf2b 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -423,6 +423,34 @@ def G_ABDU : GenericInstruction {
   let isCommutable = true;
 }
 
+// Generic vector average truncated unsigned.
+def G_AVGFLOORU : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
+// Generic vector average rounded unsigned.
+def G_AVGCEILU : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
+// Generic vector average truncated signed.
+def G_AVGFLOORS : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
+// Generic vector average rounded signed.
+def G_AVGCEILS : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
 /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
 /// fshl(X,Y,Z): (X << (Z % bitwidth)) | (Y >> (bitwidth - (Z % bitwidth)))
 def G_FSHL : GenericInstruction {
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index c0d480294dd8b..137b291d25d35 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -83,6 +83,10 @@ def : GINodeEquiv<G_LSHR, srl>;
 def : GINodeEquiv<G_ASHR, sra>;
 def : GINodeEquiv<G_ABDS, abds>;
 def : GINodeEquiv<G_ABDU, abdu>;
+def : GINodeEquiv<G_AVGFLOORU, avgflooru>;
+def : GINodeEquiv<G_AVGCEILU, avgceilu>;
+def : GINodeEquiv<G_AVGFLOORS, avgfloors>;
+def : GINodeEquiv<G_AVGCEILS, avgceils>;
 def : GINodeEquiv<G_SADDSAT, saddsat>;
 def : GINodeEquiv<G_UADDSAT, uaddsat>;
 def : GINodeEquiv<G_SSUBSAT, ssubsat>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index d055e28f41e35..7791eda6cd14a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -239,30 +239,6 @@ def G_USDOT : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
-def G_AVGFLOORU : AArch64GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src1, type1:$src2);
-  let hasSideEffects = 0;
-}
-
-def G_AVGCEILU : AArch64GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src1, type1:$src2);
-  let hasSideEffects = 0;
-}
-
-def G_AVGFLOORS : AArch64GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src1, type1:$src2);
-  let hasSideEffects = 0;
-}
-
-def G_AVGCEILS : AArch64GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src1, type1:$src2);
-  let hasSideEffects = 0;
-}
-
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 204f4b2b4c2de..2e64b7ba0bd5a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -289,7 +289,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .moreElementsToNextPow2(0)
       .lower();
 
-  getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+  getActionDefinitionsBuilder({G_ABDS, G_ABDU, G_AVGFLOORU, G_AVGCEILU, G_AVGFLOORS, G_AVGCEILS})
       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
       .lower();
 
@@ -1818,13 +1818,13 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::aarch64_neon_uabd:
     return LowerBinOp(TargetOpcode::G_ABDU);
   case Intrinsic::aarch64_neon_uhadd:
-    return LowerBinOp(AArch64::G_AVGFLOORU);
+    return LowerBinOp(TargetOpcode::G_AVGFLOORU);
   case Intrinsic::aarch64_neon_urhadd:
-    return LowerBinOp(AArch64::G_AVGCEILU);
+    return LowerBinOp(TargetOpcode::G_AVGCEILU);
   case Intrinsic::aarch64_neon_shadd:
-    return LowerBinOp(AArch64::G_AVGFLOORS);
+    return LowerBinOp(TargetOpcode::G_AVGFLOORS);
   case Intrinsic::aarch64_neon_srhadd:
-    return LowerBinOp(AArch64::G_AVGCEILS);
+    return LowerBinOp(TargetOpcode::G_AVGCEILS);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 896603d6eb20d..800b575dc7920 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -79,6 +79,26 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
+# DEBUG-NEXT: G_AVGFLOORU (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+#
+# DEBUG-NEXT: G_AVGCEILU (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+#
+# DEBUG-NEXT: G_AVGFLOORS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+#
+# DEBUG-NEXT: G_AVGCEILS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+#
 # DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: {{[0-9]+}}, OK
 # DEBUG-NEXT: .. the first uncovered imm index: {{[0-9]+}}, OK
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
index 18960b43ab97d..df645c28ace9b 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
@@ -96,7 +96,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 
 // CHECK:      const uint8_t *GenMyCombiner::getMatchTable() const {
 // CHECK-NEXT:   constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT:      /*   0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(211), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
+// CHECK-NEXT:      /*   0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(103), GIMT_Encode2(215), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
 // CHECK-NEXT:      /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
 // CHECK-NEXT:      /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0),
 // CHECK-NEXT:      /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
index fdabc53a3ff3b..64ca63da3b6f0 100644
--- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
@@ -535,7 +535,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
 // R00O-NEXT:  GIM_Reject,
 // R00O:       // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
 // R00O-NEXT:  GIM_Reject,
-// R00O-NEXT:  }; // Size: 1902 bytes
+// R00O-NEXT:  }; // Size: 1918 bytes
 
 def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
                  [(set GPR32:$dst,
diff --git a/llvm/test/TableGen/get-named-operand-idx.td b/llvm/test/TableGen/get-named-operand-idx.td
index e6f6331cd9c48..59693eba50bdc 100644
--- a/llvm/test/TableGen/get-named-operand-idx.td
+++ b/llvm/test/TableGen/get-named-operand-idx.td
@@ -89,7 +89,8 @@ def InstD : InstBase {
 // CHECK-NEXT:      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT:      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT:      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-// CHECK-NEXT:      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0,
+// CHECK-NEXT:      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// CHECK-NEXT:      1, 2, 2, 0,
 // CHECK-NEXT:    };
 // CHECK-NEXT:    return InstructionIndex[Opcode];
 // CHECK-NEXT:  }

>From bc312fc63d46163f9e5c1acc6c97d3c511cbe86c Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Tue, 28 Oct 2025 09:17:55 +0000
Subject: [PATCH 08/12] [AArch64][GlobalISel] Fixed formatting

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 2e64b7ba0bd5a..942455e1942b8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -289,7 +289,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .moreElementsToNextPow2(0)
       .lower();
 
-  getActionDefinitionsBuilder({G_ABDS, G_ABDU, G_AVGFLOORU, G_AVGCEILU, G_AVGFLOORS, G_AVGCEILS})
+  getActionDefinitionsBuilder(
+      {G_ABDS, G_ABDU, G_AVGFLOORU, G_AVGCEILU, G_AVGFLOORS, G_AVGCEILS})
       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
       .lower();
 

>From dc04caaac14bb8be0f937b5cafe2e57b97b08bac Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Tue, 4 Nov 2025 09:19:02 +0000
Subject: [PATCH 09/12] [AArch64][GlobalISel] Renamed GISel nodes for
 consistency

---
 llvm/include/llvm/Support/TargetOpcodes.def            |  8 ++++----
 llvm/include/llvm/Target/GenericOpcodes.td             |  8 ++++----
 .../llvm/Target/GlobalISel/SelectionDAGCompat.td       |  8 ++++----
 llvm/lib/Target/AArch64/AArch64InstrGISel.td           |  8 ++++----
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 10 +++++-----
 .../AArch64/GlobalISel/legalizer-info-validation.mir   |  8 ++++----
 6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index d7a2e899ffd6f..0d43dce5d6357 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -296,16 +296,16 @@ HANDLE_TARGET_OPCODE(G_ABDS)
 HANDLE_TARGET_OPCODE(G_ABDU)
 
 /// Generic vector average with truncate unsigned instruction.
-HANDLE_TARGET_OPCODE(G_AVGFLOORU)
+HANDLE_TARGET_OPCODE(G_UAVGFLOOR)
 
 /// Generic vector average with round unsigned instruction.
-HANDLE_TARGET_OPCODE(G_AVGCEILU)
+HANDLE_TARGET_OPCODE(G_UAVGCEIL)
 
 /// Generic vector average with truncate signed instruction.
-HANDLE_TARGET_OPCODE(G_AVGFLOORS)
+HANDLE_TARGET_OPCODE(G_SAVGFLOOR)
 
 /// Generic vector average with round signed instruction.
-HANDLE_TARGET_OPCODE(G_AVGCEILS)
+HANDLE_TARGET_OPCODE(G_SAVGCEIL)
 
 HANDLE_TARGET_OPCODE(G_IMPLICIT_DEF)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index b847e0425cf2b..1b65b8b73527d 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -424,28 +424,28 @@ def G_ABDU : GenericInstruction {
 }
 
 // Generic vector average truncated unsigned.
-def G_AVGFLOORU : GenericInstruction {
+def G_UAVGFLOOR : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type0:$src2);
   let hasSideEffects = 0;
 }
 
 // Generic vector average rounded unsigned.
-def G_AVGCEILU : GenericInstruction {
+def G_UAVGCEIL : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type0:$src2);
   let hasSideEffects = 0;
 }
 
 // Generic vector average truncated signed.
-def G_AVGFLOORS : GenericInstruction {
+def G_SAVGFLOOR : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type0:$src2);
   let hasSideEffects = 0;
 }
 
 // Generic vector average rounded signed.
-def G_AVGCEILS : GenericInstruction {
+def G_SAVGCEIL : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type0:$src2);
   let hasSideEffects = 0;
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 137b291d25d35..a69e089779315 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -83,10 +83,10 @@ def : GINodeEquiv<G_LSHR, srl>;
 def : GINodeEquiv<G_ASHR, sra>;
 def : GINodeEquiv<G_ABDS, abds>;
 def : GINodeEquiv<G_ABDU, abdu>;
-def : GINodeEquiv<G_AVGFLOORU, avgflooru>;
-def : GINodeEquiv<G_AVGCEILU, avgceilu>;
-def : GINodeEquiv<G_AVGFLOORS, avgfloors>;
-def : GINodeEquiv<G_AVGCEILS, avgceils>;
+def : GINodeEquiv<G_UAVGFLOOR, avgflooru>;
+def : GINodeEquiv<G_UAVGCEIL, avgceilu>;
+def : GINodeEquiv<G_SAVGFLOOR, avgfloors>;
+def : GINodeEquiv<G_SAVGCEIL, avgceils>;
 def : GINodeEquiv<G_SADDSAT, saddsat>;
 def : GINodeEquiv<G_UADDSAT, uaddsat>;
 def : GINodeEquiv<G_SSUBSAT, ssubsat>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 7791eda6cd14a..dffff27ce94aa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -286,10 +286,10 @@ def : GINodeEquiv<G_UDOT, AArch64udot>;
 def : GINodeEquiv<G_SDOT, AArch64sdot>;
 def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
-def : GINodeEquiv<G_AVGFLOORU, avgflooru>;
-def : GINodeEquiv<G_AVGCEILU, avgceilu>;
-def : GINodeEquiv<G_AVGFLOORS, avgfloors>;
-def : GINodeEquiv<G_AVGCEILS, avgceils>;
+def : GINodeEquiv<G_UAVGFLOOR, avgflooru>;
+def : GINodeEquiv<G_UAVGCEIL, avgceilu>;
+def : GINodeEquiv<G_SAVGFLOOR, avgfloors>;
+def : GINodeEquiv<G_SAVGCEIL, avgceils>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 942455e1942b8..6af3fd9c65984 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -290,7 +290,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .lower();
 
   getActionDefinitionsBuilder(
-      {G_ABDS, G_ABDU, G_AVGFLOORU, G_AVGCEILU, G_AVGFLOORS, G_AVGCEILS})
+      {G_ABDS, G_ABDU, G_UAVGFLOOR, G_UAVGCEIL, G_SAVGFLOOR, G_SAVGCEIL})
       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
       .lower();
 
@@ -1819,13 +1819,13 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::aarch64_neon_uabd:
     return LowerBinOp(TargetOpcode::G_ABDU);
   case Intrinsic::aarch64_neon_uhadd:
-    return LowerBinOp(TargetOpcode::G_AVGFLOORU);
+    return LowerBinOp(TargetOpcode::G_UAVGFLOOR);
   case Intrinsic::aarch64_neon_urhadd:
-    return LowerBinOp(TargetOpcode::G_AVGCEILU);
+    return LowerBinOp(TargetOpcode::G_UAVGCEIL);
   case Intrinsic::aarch64_neon_shadd:
-    return LowerBinOp(TargetOpcode::G_AVGFLOORS);
+    return LowerBinOp(TargetOpcode::G_SAVGFLOOR);
   case Intrinsic::aarch64_neon_srhadd:
-    return LowerBinOp(TargetOpcode::G_AVGCEILS);
+    return LowerBinOp(TargetOpcode::G_SAVGCEIL);
   case Intrinsic::aarch64_neon_abs: {
     // Lower the intrinsic to G_ABS.
     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 800b575dc7920..7edebd576d268 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -79,22 +79,22 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_AVGFLOORU (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_UAVGFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_AVGCEILU (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_UAVGCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_AVGFLOORS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_SAVGFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_AVGCEILS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: G_SAVGCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected

>From 61d7a971cdfb605c43c5d76269cf6cf84f798bd2 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Wed, 5 Nov 2025 09:57:54 +0000
Subject: [PATCH 10/12] [GlobalISel] Added documentation for gMIR instructions
 into GenericOpcode.rst

---
 llvm/docs/GlobalISel/GenericOpcode.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 661a11537cf57..72cb8c6efca10 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -511,6 +511,19 @@ Compute the absolute difference (signed and unsigned), e.g. trunc(abs(ext(x)-ext
   %0:_(s33) = G_ABDS %2, %3
   %1:_(s33) = G_ABDU %4, %5
 
+G_UAVGFLOOR, G_UAVGCEIL, G_SAVGFLOOR, G_SAVGCEIL
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Compute the average of corresponding elements in two vectors (signed and unsigned).
+The FLOOR forms round down, e.g. trunc(shr(ext(a)+ext(b), 1)); the CEIL forms round up, e.g. trunc(shr(ext(a)+ext(b)+1, 1)).
+
+.. code-block:: none
+
+  %0:_(<4 x s16>) = G_UAVGFLOOR %4:_(<4 x s16>), %5:_(<4 x s16>)
+  %1:_(<4 x s16>) = G_UAVGCEIL %6:_(<4 x s16>), %7:_(<4 x s16>)
+  %2:_(<4 x s16>) = G_SAVGFLOOR %8:_(<4 x s16>), %9:_(<4 x s16>)
+  %3:_(<4 x s16>) = G_SAVGCEIL %10:_(<4 x s16>), %11:_(<4 x s16>)
+
 Floating Point Operations
 -------------------------
 

>From f11c039a8aa23277f5efe35033b9bd392a9af56e Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Wed, 5 Nov 2025 10:56:50 +0000
Subject: [PATCH 11/12] [AArch64][GlobalISel] Modified trunc-avg-fold.ll to
 separately test SDAG and GISel generated code

Test file contains only CHECK-SD and CHECK-GI prefixes, as shared CHECK prefix is not needed.
---
 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 69 +++++++++++++++------
 1 file changed, 50 insertions(+), 19 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 54fcae4ba28b7..0a72bbccf0ed2 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,11 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon -global-isel < %s | FileCheck %s --check-prefixes=CHECK-GI
 
 define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: avgceil_u_i8_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: avgceil_u_i8_to_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: avgceil_u_i8_to_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
   %a16 = zext <8 x i8> %a to <8 x i16>
   %b16 = zext <8 x i8> %b to <8 x i16>
   %avg16 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
@@ -15,10 +24,18 @@ define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
 
 
 define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: test_avgceil_s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_avgceil_s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_avgceil_s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
   %a16 = sext <8 x i8> %a to <8 x i16>
   %b16 = sext <8 x i8> %b to <8 x i16>
   %avg16 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
@@ -27,10 +44,18 @@ define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
 }
 
 define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: avgfloor_u_i8_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: avgfloor_u_i8_to_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: avgfloor_u_i8_to_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
   %a16 = zext  <8 x i8>  %a to <8 x i16>
   %b16 = zext  <8 x i8>  %b to <8 x i16>
   %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
@@ -39,15 +64,21 @@ define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
 }
 
 define <8 x i8> @test_avgfloor_s(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: test_avgfloor_s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_avgfloor_s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_avgfloor_s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
   %a16 = sext  <8 x i8>  %a to <8 x i16>
   %b16 = sext  <8 x i8>  %b to <8 x i16>
   %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
   %res  = trunc <8 x i16> %avg16 to <8 x i8>
   ret <8 x i8> %res
 }
-
-

>From 2f2dd5fd41df79c89846de3d4e3bce19f844d4d1 Mon Sep 17 00:00:00 2001
From: Josh Rodriguez <josh.rodriguez at arm.com>
Date: Wed, 5 Nov 2025 16:28:01 +0000
Subject: [PATCH 12/12] [AArch64][GlobalISel] Modified arm64-vhadd.ll to
 separately test SDAG and GISel generated code

---
 llvm/test/CodeGen/AArch64/arm64-vhadd.ll | 1599 ++++++++++++++++------
 1 file changed, 1162 insertions(+), 437 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index a505b42e3423a..09ea9eeb03914 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1,5 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:	 warning: Instruction selection used fallback path for ext_via_i19
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srhadd_v2i32_trunc
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urhadd_v2i32_trunc
 
 define <8 x i8> @shadd8b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
 ; CHECK-LABEL: shadd8b:
@@ -327,11 +332,20 @@ define <4 x i32> @urhadd4s(ptr nocapture readonly %A, ptr nocapture readonly %B)
 }
 
 define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #1
+; CHECK-GI-NEXT:    saddl.8h v0, v0, v1
+; CHECK-GI-NEXT:    add.8h v0, v0, v2
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
   %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
   %add1 = add nsw <8 x i16> %sextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -343,11 +357,20 @@ define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture w
 }
 
 define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v2, #1
+; CHECK-GI-NEXT:    saddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v0, v0, v2
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
   %add1 = add nsw <4 x i32> %sextsrc1, <i32 1, i32 1, i32 1, i32 1>
@@ -359,11 +382,21 @@ define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI26_0
+; CHECK-GI-NEXT:    saddl.2d v0, v0, v1
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI26_0]
+; CHECK-GI-NEXT:    add.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
   %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
   %add1 = add nsw <2 x i64> %sextsrc1, <i64 1, i64 1>
@@ -375,11 +408,23 @@ define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #1
+; CHECK-GI-NEXT:    saddl.8h v3, v0, v1
+; CHECK-GI-NEXT:    saddl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    add.8h v1, v3, v2
+; CHECK-GI-NEXT:    add.8h v0, v0, v2
+; CHECK-GI-NEXT:    shrn.8b v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
   %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
   %add1 = add nsw <16 x i16> %sextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -391,11 +436,23 @@ define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocaptur
 }
 
 define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v2, #1
+; CHECK-GI-NEXT:    saddl.4s v3, v0, v1
+; CHECK-GI-NEXT:    saddl2.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v1, v3, v2
+; CHECK-GI-NEXT:    add.4s v0, v0, v2
+; CHECK-GI-NEXT:    shrn.4h v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
   %add1 = add nsw <8 x i32> %sextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -407,11 +464,24 @@ define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSRHADD4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSRHADD4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSRHADD4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI29_0
+; CHECK-GI-NEXT:    saddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.2d v0, v0, v1
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI29_0]
+; CHECK-GI-NEXT:    add.2d v1, v2, v3
+; CHECK-GI-NEXT:    add.2d v0, v0, v3
+; CHECK-GI-NEXT:    shrn.2s v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
   %add1 = add nsw <4 x i64> %sextsrc1, <i64 1, i64 1, i64 1, i64 1>
@@ -423,11 +493,18 @@ define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToSHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
   %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
   %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
@@ -438,11 +515,18 @@ define void @testLowerToSHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture wr
 }
 
 define void @testLowerToSHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
   %add = add nsw <4 x i32> %sextsrc1, %sextsrc2
@@ -453,11 +537,18 @@ define void @testLowerToSHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToSHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
   %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
   %add = add nsw <2 x i64> %sextsrc1, %sextsrc2
@@ -468,11 +559,20 @@ define void @testLowerToSHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToSHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.8h v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
   %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
   %add = add nsw <16 x i16> %sextsrc1, %sextsrc2
@@ -483,11 +583,20 @@ define void @testLowerToSHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture
 }
 
 define void @testLowerToSHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.4s v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.4s v0, v0, v1
+; CHECK-GI-NEXT:    shrn.4h v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
   %add = add nsw <8 x i32> %sextsrc1, %sextsrc2
@@ -498,11 +607,20 @@ define void @testLowerToSHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToSHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
   %add = add nsw <4 x i64> %sextsrc1, %sextsrc2
@@ -513,11 +631,20 @@ define void @testLowerToSHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #1
+; CHECK-GI-NEXT:    uaddl.8h v0, v0, v1
+; CHECK-GI-NEXT:    add.8h v0, v0, v2
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
   %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
   %add1 = add nuw nsw <8 x i16> %zextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -529,11 +656,20 @@ define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture w
 }
 
 define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v2, #1
+; CHECK-GI-NEXT:    uaddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v0, v0, v2
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
   %add1 = add nuw nsw <4 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1>
@@ -545,11 +681,21 @@ define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
+; CHECK-GI-NEXT:    uaddl.2d v0, v0, v1
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT:    add.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
   %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
   %add1 = add nuw nsw <2 x i64> %zextsrc1, <i64 1, i64 1>
@@ -561,11 +707,23 @@ define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #1
+; CHECK-GI-NEXT:    uaddl.8h v3, v0, v1
+; CHECK-GI-NEXT:    uaddl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    add.8h v1, v3, v2
+; CHECK-GI-NEXT:    add.8h v0, v0, v2
+; CHECK-GI-NEXT:    shrn.8b v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
   %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
   %add1 = add nuw nsw <16 x i16> %zextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -577,11 +735,23 @@ define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocaptur
 }
 
 define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v2, #1
+; CHECK-GI-NEXT:    uaddl.4s v3, v0, v1
+; CHECK-GI-NEXT:    uaddl2.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v1, v3, v2
+; CHECK-GI-NEXT:    add.4s v0, v0, v2
+; CHECK-GI-NEXT:    shrn.4h v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
   %add1 = add nuw nsw <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -593,11 +763,24 @@ define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToURHADD4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToURHADD4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToURHADD4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
+; CHECK-GI-NEXT:    uaddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.2d v0, v0, v1
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT:    add.2d v1, v2, v3
+; CHECK-GI-NEXT:    add.2d v0, v0, v3
+; CHECK-GI-NEXT:    shrn.2s v1, v1, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
   %add1 = add nuw nsw <4 x i64> %zextsrc1, <i64 1, i64 1, i64 1, i64 1>
@@ -609,11 +792,18 @@ define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToUHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
   %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
   %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
@@ -624,11 +814,18 @@ define void @testLowerToUHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture wr
 }
 
 define void @testLowerToUHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
   %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
@@ -639,11 +836,18 @@ define void @testLowerToUHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToUHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
   %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
   %add = add nuw nsw <2 x i64> %zextsrc1, %zextsrc2
@@ -654,11 +858,20 @@ define void @testLowerToUHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture
 }
 
 define void @testLowerToUHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.8h v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
   %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
   %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
@@ -669,11 +882,20 @@ define void @testLowerToUHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture
 }
 
 define void @testLowerToUHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.4s v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.4s v0, v0, v1
+; CHECK-GI-NEXT:    shrn.4h v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
   %add = add nuw nsw <8 x i32> %zextsrc1, %zextsrc2
@@ -684,11 +906,20 @@ define void @testLowerToUHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture
 }
 
 define void @testLowerToUHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.2d v0, v0, v1
+; CHECK-GI-NEXT:    shrn.2s v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
   %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
@@ -699,11 +930,17 @@ define void @testLowerToUHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture
 }
 
 define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) {
-; CHECK-LABEL: hadd16_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.4h v0, v0, v1
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd16_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    sshll.4s v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd16_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    sshr.4s v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
   %add = add nsw <4 x i32> %zextsrc1, %zextsrc2
@@ -712,11 +949,17 @@ define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) {
 }
 
 define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) {
-; CHECK-LABEL: hadd16_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd16_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ushll.4s v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd16_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    ushr.4s v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
   %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
@@ -738,11 +981,17 @@ define <4 x i32> @hadd16_sext_lsr(<4 x i16> %src1, <4 x i16> %src2) {
 }
 
 define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) {
-; CHECK-LABEL: hadd16_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd16_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ushll.4s v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd16_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.4s v0, v0, v1
+; CHECK-GI-NEXT:    ushr.4s v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
   %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
@@ -751,12 +1000,20 @@ define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) {
 }
 
 define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: hadd32_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd.4s v0, v0, v1
-; CHECK-NEXT:    sshll2.2d v1, v0, #0
-; CHECK-NEXT:    sshll.2d v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd32_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    sshll2.2d v1, v0, #0
+; CHECK-SD-NEXT:    sshll.2d v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd32_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.2d v1, v0, v1
+; CHECK-GI-NEXT:    sshr.2d v0, v2, #1
+; CHECK-GI-NEXT:    sshr.2d v1, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
   %add = add nsw <4 x i64> %zextsrc1, %zextsrc2
@@ -765,12 +1022,20 @@ define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) {
 }
 
 define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: hadd32_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4s v0, v0, v1
-; CHECK-NEXT:    ushll2.2d v1, v0, #0
-; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd32_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-SD-NEXT:    ushll.2d v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd32_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.2d v1, v0, v1
+; CHECK-GI-NEXT:    ushr.2d v0, v2, #1
+; CHECK-GI-NEXT:    ushr.2d v1, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
   %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
@@ -779,13 +1044,21 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) {
 }
 
 define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: hadd32_sext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    saddl.2d v2, v0, v1
-; CHECK-NEXT:    saddl2.2d v0, v0, v1
-; CHECK-NEXT:    ushr.2d v1, v0, #1
-; CHECK-NEXT:    ushr.2d v0, v2, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd32_sext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    saddl.2d v2, v0, v1
+; CHECK-SD-NEXT:    saddl2.2d v0, v0, v1
+; CHECK-SD-NEXT:    ushr.2d v1, v0, #1
+; CHECK-SD-NEXT:    ushr.2d v0, v2, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd32_sext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    saddl2.2d v1, v0, v1
+; CHECK-GI-NEXT:    ushr.2d v0, v2, #1
+; CHECK-GI-NEXT:    ushr.2d v1, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
   %add = add nsw <4 x i64> %zextsrc1, %zextsrc2
@@ -794,12 +1067,20 @@ define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
 }
 
 define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: hadd32_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd.4s v0, v0, v1
-; CHECK-NEXT:    ushll2.2d v1, v0, #0
-; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd32_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-SD-NEXT:    ushll.2d v0, v0, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd32_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl.2d v2, v0, v1
+; CHECK-GI-NEXT:    uaddl2.2d v1, v0, v1
+; CHECK-GI-NEXT:    ushr.2d v0, v2, #1
+; CHECK-GI-NEXT:    ushr.2d v1, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
   %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
@@ -808,14 +1089,23 @@ define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
 }
 
 define <4 x i16> @hadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: hadd8_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.4h v1, v1, #8
-; CHECK-NEXT:    shl.4h v0, v0, #8
-; CHECK-NEXT:    sshr.4h v1, v1, #8
-; CHECK-NEXT:    sshr.4h v0, v0, #8
-; CHECK-NEXT:    shadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.4h v1, v1, #8
+; CHECK-SD-NEXT:    shl.4h v0, v0, #8
+; CHECK-SD-NEXT:    sshr.4h v1, v1, #8
+; CHECK-SD-NEXT:    sshr.4h v0, v0, #8
+; CHECK-SD-NEXT:    shadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.4h v1, v1, #8
+; CHECK-GI-NEXT:    shl.4h v0, v0, #8
+; CHECK-GI-NEXT:    sshr.4h v1, v1, #8
+; CHECK-GI-NEXT:    ssra.4h v1, v0, #8
+; CHECK-GI-NEXT:    sshr.4h v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
   %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -824,12 +1114,21 @@ define <4 x i16> @hadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @hadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: hadd8_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic.4h v1, #255, lsl #8
+; CHECK-SD-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    add.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
   %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -838,14 +1137,23 @@ define <4 x i16> @hadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @hadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: hadd8_sext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.4h v0, v0, #8
-; CHECK-NEXT:    shl.4h v1, v1, #8
-; CHECK-NEXT:    sshr.4h v0, v0, #8
-; CHECK-NEXT:    ssra.4h v0, v1, #8
-; CHECK-NEXT:    ushr.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8_sext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.4h v0, v0, #8
+; CHECK-SD-NEXT:    shl.4h v1, v1, #8
+; CHECK-SD-NEXT:    sshr.4h v0, v0, #8
+; CHECK-SD-NEXT:    ssra.4h v0, v1, #8
+; CHECK-SD-NEXT:    ushr.4h v0, v0, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8_sext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.4h v1, v1, #8
+; CHECK-GI-NEXT:    shl.4h v0, v0, #8
+; CHECK-GI-NEXT:    sshr.4h v1, v1, #8
+; CHECK-GI-NEXT:    ssra.4h v1, v0, #8
+; CHECK-GI-NEXT:    ushr.4h v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
   %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -854,12 +1162,21 @@ define <4 x i16> @hadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @hadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: hadd8_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic.4h v1, #255, lsl #8
+; CHECK-SD-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    add.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
   %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -868,14 +1185,30 @@ define <4 x i16> @hadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <2 x i16> @hadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: hadd8x2_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.2s v1, v1, #24
-; CHECK-NEXT:    shl.2s v0, v0, #24
-; CHECK-NEXT:    sshr.2s v1, v1, #24
-; CHECK-NEXT:    sshr.2s v0, v0, #24
-; CHECK-NEXT:    shadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8x2_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.2s v1, v1, #24
+; CHECK-SD-NEXT:    shl.2s v0, v0, #24
+; CHECK-SD-NEXT:    sshr.2s v1, v1, #24
+; CHECK-SD-NEXT:    sshr.2s v0, v0, #24
+; CHECK-SD-NEXT:    shadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8x2_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.2s v1, v1, #24
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    shl.2s v0, v0, #24
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    sshr.2s v1, v1, #24
+; CHECK-GI-NEXT:    mov.h v2[1], w8
+; CHECK-GI-NEXT:    ssra.2s v1, v0, #24
+; CHECK-GI-NEXT:    uzp1.4h v0, v1, v0
+; CHECK-GI-NEXT:    neg.4h v1, v2
+; CHECK-GI-NEXT:    sshl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
   %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -884,13 +1217,29 @@ define <2 x i16> @hadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @hadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: hadd8x2_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and.8b v1, v1, v2
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    uhadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8x2_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and.8b v1, v1, v2
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    uhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8x2_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v1
+; CHECK-GI-NEXT:    mov.h v2[1], w8
+; CHECK-GI-NEXT:    uzp1.4h v0, v0, v0
+; CHECK-GI-NEXT:    neg.4h v1, v2
+; CHECK-GI-NEXT:    ushl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
   %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -899,16 +1248,32 @@ define <2 x i16> @hadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: hadd8x2_sext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.2s v0, v0, #24
-; CHECK-NEXT:    shl.2s v1, v1, #24
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    sshr.2s v0, v0, #24
-; CHECK-NEXT:    ssra.2s v0, v1, #24
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    ushr.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8x2_sext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.2s v0, v0, #24
+; CHECK-SD-NEXT:    shl.2s v1, v1, #24
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    sshr.2s v0, v0, #24
+; CHECK-SD-NEXT:    ssra.2s v0, v1, #24
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    ushr.2s v0, v0, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8x2_sext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.2s v1, v1, #24
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    shl.2s v0, v0, #24
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    sshr.2s v1, v1, #24
+; CHECK-GI-NEXT:    mov.h v2[1], w8
+; CHECK-GI-NEXT:    ssra.2s v1, v0, #24
+; CHECK-GI-NEXT:    uzp1.4h v0, v1, v0
+; CHECK-GI-NEXT:    neg.4h v1, v2
+; CHECK-GI-NEXT:    ushl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
   %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -917,13 +1282,29 @@ define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @hadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: hadd8x2_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and.8b v1, v1, v2
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    uhadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: hadd8x2_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and.8b v1, v1, v2
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    uhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hadd8x2_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v1
+; CHECK-GI-NEXT:    mov.h v2[1], w8
+; CHECK-GI-NEXT:    uzp1.4h v0, v0, v0
+; CHECK-GI-NEXT:    neg.4h v1, v2
+; CHECK-GI-NEXT:    ushl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
   %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -932,14 +1313,25 @@ define <2 x i16> @hadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <4 x i16> @rhadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: rhadd8_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.4h v1, v1, #8
-; CHECK-NEXT:    shl.4h v0, v0, #8
-; CHECK-NEXT:    sshr.4h v1, v1, #8
-; CHECK-NEXT:    sshr.4h v0, v0, #8
-; CHECK-NEXT:    srhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.4h v1, v1, #8
+; CHECK-SD-NEXT:    shl.4h v0, v0, #8
+; CHECK-SD-NEXT:    sshr.4h v1, v1, #8
+; CHECK-SD-NEXT:    sshr.4h v0, v0, #8
+; CHECK-SD-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.4h v1, v1, #8
+; CHECK-GI-NEXT:    shl.4h v0, v0, #8
+; CHECK-GI-NEXT:    movi.4h v2, #1
+; CHECK-GI-NEXT:    sshr.4h v1, v1, #8
+; CHECK-GI-NEXT:    ssra.4h v1, v0, #8
+; CHECK-GI-NEXT:    add.4h v0, v1, v2
+; CHECK-GI-NEXT:    sshr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
   %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -949,12 +1341,23 @@ define <4 x i16> @rhadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @rhadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: rhadd8_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:    urhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic.4h v1, #255, lsl #8
+; CHECK-SD-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-SD-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    movi.4h v2, #1
+; CHECK-GI-NEXT:    add.4h v0, v0, v1
+; CHECK-GI-NEXT:    add.4h v0, v0, v2
+; CHECK-GI-NEXT:    ushr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
   %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -964,16 +1367,27 @@ define <4 x i16> @rhadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: rhadd8_sext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.4h v0, v0, #8
-; CHECK-NEXT:    shl.4h v1, v1, #8
-; CHECK-NEXT:    movi.4h v2, #1
-; CHECK-NEXT:    sshr.4h v0, v0, #8
-; CHECK-NEXT:    ssra.4h v0, v1, #8
-; CHECK-NEXT:    add.4h v0, v0, v2
-; CHECK-NEXT:    ushr.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8_sext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.4h v0, v0, #8
+; CHECK-SD-NEXT:    shl.4h v1, v1, #8
+; CHECK-SD-NEXT:    movi.4h v2, #1
+; CHECK-SD-NEXT:    sshr.4h v0, v0, #8
+; CHECK-SD-NEXT:    ssra.4h v0, v1, #8
+; CHECK-SD-NEXT:    add.4h v0, v0, v2
+; CHECK-SD-NEXT:    ushr.4h v0, v0, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8_sext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.4h v1, v1, #8
+; CHECK-GI-NEXT:    shl.4h v0, v0, #8
+; CHECK-GI-NEXT:    movi.4h v2, #1
+; CHECK-GI-NEXT:    sshr.4h v1, v1, #8
+; CHECK-GI-NEXT:    ssra.4h v1, v0, #8
+; CHECK-GI-NEXT:    add.4h v0, v1, v2
+; CHECK-GI-NEXT:    ushr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
   %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -983,12 +1397,23 @@ define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <4 x i16> @rhadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
-; CHECK-LABEL: rhadd8_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:    urhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    bic.4h v1, #255, lsl #8
+; CHECK-SD-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-SD-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    movi.4h v2, #1
+; CHECK-GI-NEXT:    add.4h v0, v0, v1
+; CHECK-GI-NEXT:    add.4h v0, v0, v2
+; CHECK-GI-NEXT:    ushr.4h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
   %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
@@ -998,14 +1423,32 @@ define <4 x i16> @rhadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
 }
 
 define <2 x i16> @rhadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: rhadd8x2_sext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.2s v1, v1, #24
-; CHECK-NEXT:    shl.2s v0, v0, #24
-; CHECK-NEXT:    sshr.2s v1, v1, #24
-; CHECK-NEXT:    sshr.2s v0, v0, #24
-; CHECK-NEXT:    srhadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8x2_sext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.2s v1, v1, #24
+; CHECK-SD-NEXT:    shl.2s v0, v0, #24
+; CHECK-SD-NEXT:    sshr.2s v1, v1, #24
+; CHECK-SD-NEXT:    sshr.2s v0, v0, #24
+; CHECK-SD-NEXT:    srhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8x2_sext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.2s v1, v1, #24
+; CHECK-GI-NEXT:    shl.2s v0, v0, #24
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    dup.2s v2, w8
+; CHECK-GI-NEXT:    sshr.2s v1, v1, #24
+; CHECK-GI-NEXT:    ssra.2s v1, v0, #24
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov.h v0[1], w8
+; CHECK-GI-NEXT:    add.2s v1, v1, v2
+; CHECK-GI-NEXT:    uzp1.4h v1, v1, v0
+; CHECK-GI-NEXT:    neg.4h v0, v0
+; CHECK-GI-NEXT:    sshl.4h v0, v1, v0
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
   %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -1015,13 +1458,31 @@ define <2 x i16> @rhadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @rhadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: rhadd8x2_zext_asr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and.8b v1, v1, v2
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    urhadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8x2_zext_asr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and.8b v1, v1, v2
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    urhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8x2_zext_asr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    dup.2s v2, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v2
+; CHECK-GI-NEXT:    mov.h v1[1], w8
+; CHECK-GI-NEXT:    uzp1.4h v0, v0, v0
+; CHECK-GI-NEXT:    neg.4h v1, v1
+; CHECK-GI-NEXT:    ushl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
   %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -1031,18 +1492,36 @@ define <2 x i16> @rhadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @rhadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: rhadd8x2_sext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl.2s v0, v0, #24
-; CHECK-NEXT:    shl.2s v1, v1, #24
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    sshr.2s v0, v0, #24
-; CHECK-NEXT:    sshr.2s v1, v1, #24
-; CHECK-NEXT:    mvn.8b v0, v0
-; CHECK-NEXT:    sub.2s v0, v1, v0
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    ushr.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8x2_sext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl.2s v0, v0, #24
+; CHECK-SD-NEXT:    shl.2s v1, v1, #24
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    sshr.2s v0, v0, #24
+; CHECK-SD-NEXT:    sshr.2s v1, v1, #24
+; CHECK-SD-NEXT:    mvn.8b v0, v0
+; CHECK-SD-NEXT:    sub.2s v0, v1, v0
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    ushr.2s v0, v0, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8x2_sext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl.2s v1, v1, #24
+; CHECK-GI-NEXT:    shl.2s v0, v0, #24
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    dup.2s v2, w8
+; CHECK-GI-NEXT:    sshr.2s v1, v1, #24
+; CHECK-GI-NEXT:    ssra.2s v1, v0, #24
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov.h v0[1], w8
+; CHECK-GI-NEXT:    add.2s v1, v1, v2
+; CHECK-GI-NEXT:    uzp1.4h v1, v1, v0
+; CHECK-GI-NEXT:    neg.4h v0, v0
+; CHECK-GI-NEXT:    ushl.4h v0, v1, v0
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
   %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -1052,13 +1531,31 @@ define <2 x i16> @rhadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
 }
 
 define <2 x i16> @rhadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
-; CHECK-LABEL: rhadd8x2_zext_lsr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and.8b v1, v1, v2
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    urhadd.2s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: rhadd8x2_zext_lsr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and.8b v1, v1, v2
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    urhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: rhadd8x2_zext_lsr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and.8b v0, v0, v2
+; CHECK-GI-NEXT:    and.8b v1, v1, v2
+; CHECK-GI-NEXT:    dup.2s v2, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    add.2s v0, v0, v2
+; CHECK-GI-NEXT:    mov.h v1[1], w8
+; CHECK-GI-NEXT:    uzp1.4h v0, v0, v0
+; CHECK-GI-NEXT:    neg.4h v1, v1
+; CHECK-GI-NEXT:    ushl.4h v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
   %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
   %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
@@ -1069,12 +1566,20 @@ define <2 x i16> @rhadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
 
 
 define void @testLowerToSHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD8b_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8b v1, #10
-; CHECK-NEXT:    shadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD8b_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8b v1, #10
+; CHECK-SD-NEXT:    shadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD8b_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v1, #10
+; CHECK-GI-NEXT:    saddw.8h v0, v1, v0
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
   %add = add nsw <8 x i16> %sextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1084,12 +1589,20 @@ define void @testLowerToSHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest)
 }
 
 define void @testLowerToSHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD4h_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.4h v1, #10
-; CHECK-NEXT:    shadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD4h_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.4h v1, #10
+; CHECK-SD-NEXT:    shadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD4h_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v1, #10
+; CHECK-GI-NEXT:    saddw.4s v0, v1, v0
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
   %add = add nsw <4 x i32> %sextsrc1, <i32 10, i32 10, i32 10, i32 10>
   %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
@@ -1099,12 +1612,21 @@ define void @testLowerToSHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToSHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD2s_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2s v1, #10
-; CHECK-NEXT:    shadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD2s_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.2s v1, #10
+; CHECK-SD-NEXT:    shadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD2s_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI74_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI74_0]
+; CHECK-GI-NEXT:    saddw.2d v0, v1, v0
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
   %add = add nsw <2 x i64> %sextsrc1, <i64 10, i64 10>
   %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
@@ -1114,12 +1636,22 @@ define void @testLowerToSHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToSHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD16b_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    shadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD16b_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.16b v1, #10
+; CHECK-SD-NEXT:    shadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD16b_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v1, #10
+; CHECK-GI-NEXT:    saddw.8h v2, v1, v0
+; CHECK-GI-NEXT:    saddw2.8h v0, v1, v0
+; CHECK-GI-NEXT:    shrn.8b v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
   %add = add nsw <16 x i16> %sextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1129,12 +1661,22 @@ define void @testLowerToSHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %des
 }
 
 define void @testLowerToSHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD8h_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    shadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD8h_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8h v1, #10
+; CHECK-SD-NEXT:    shadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD8h_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v1, #10
+; CHECK-GI-NEXT:    saddw.4s v2, v1, v0
+; CHECK-GI-NEXT:    saddw2.4s v0, v1, v0
+; CHECK-GI-NEXT:    shrn.4h v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add = add nsw <8 x i32> %sextsrc1, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
   %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1144,12 +1686,23 @@ define void @testLowerToSHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToSHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToSHADD4s_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    shadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToSHADD4s_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.4s v1, #10
+; CHECK-SD-NEXT:    shadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToSHADD4s_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI77_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI77_0]
+; CHECK-GI-NEXT:    saddw.2d v2, v1, v0
+; CHECK-GI-NEXT:    saddw2.2d v0, v1, v0
+; CHECK-GI-NEXT:    shrn.2s v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %add = add nsw <4 x i64> %sextsrc1, <i64 10, i64 10, i64 10, i64 10>
   %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
@@ -1159,12 +1712,20 @@ define void @testLowerToSHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToUHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD8b_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8b v1, #10
-; CHECK-NEXT:    uhadd.8b v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD8b_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8b v1, #10
+; CHECK-SD-NEXT:    uhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD8b_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v1, #10
+; CHECK-GI-NEXT:    uaddw.8h v0, v1, v0
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
   %add = add nuw nsw <8 x i16> %zextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1174,12 +1735,20 @@ define void @testLowerToUHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest)
 }
 
 define void @testLowerToUHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD4h_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.4h v1, #10
-; CHECK-NEXT:    uhadd.4h v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD4h_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.4h v1, #10
+; CHECK-SD-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD4h_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v1, #10
+; CHECK-GI-NEXT:    uaddw.4s v0, v1, v0
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
   %add = add nuw nsw <4 x i32> %zextsrc1, <i32 10, i32 10, i32 10, i32 10>
   %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
@@ -1189,12 +1758,21 @@ define void @testLowerToUHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToUHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD2s_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2s v1, #10
-; CHECK-NEXT:    uhadd.2s v0, v0, v1
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD2s_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.2s v1, #10
+; CHECK-SD-NEXT:    uhadd.2s v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD2s_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI80_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI80_0]
+; CHECK-GI-NEXT:    uaddw.2d v0, v1, v0
+; CHECK-GI-NEXT:    shrn.2s v0, v0, #1
+; CHECK-GI-NEXT:    str d0, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
   %add = add nuw nsw <2 x i64> %zextsrc1, <i64 10, i64 10>
   %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
@@ -1204,12 +1782,22 @@ define void @testLowerToUHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToUHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD16b_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    uhadd.16b v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD16b_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.16b v1, #10
+; CHECK-SD-NEXT:    uhadd.16b v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD16b_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v1, #10
+; CHECK-GI-NEXT:    uaddw.8h v2, v1, v0
+; CHECK-GI-NEXT:    uaddw2.8h v0, v1, v0
+; CHECK-GI-NEXT:    shrn.8b v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.16b v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
   %add = add nuw nsw <16 x i16> %zextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1219,12 +1807,22 @@ define void @testLowerToUHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %des
 }
 
 define void @testLowerToUHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD8h_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    uhadd.8h v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD8h_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8h v1, #10
+; CHECK-SD-NEXT:    uhadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD8h_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.4s v1, #10
+; CHECK-GI-NEXT:    uaddw.4s v2, v1, v0
+; CHECK-GI-NEXT:    uaddw2.4s v0, v1, v0
+; CHECK-GI-NEXT:    shrn.4h v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.8h v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add = add nuw nsw <8 x i32> %zextsrc1, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
   %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1234,12 +1832,23 @@ define void @testLowerToUHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest
 }
 
 define void @testLowerToUHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest) {
-; CHECK-LABEL: testLowerToUHADD4s_c:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    uhadd.4s v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: testLowerToUHADD4s_c:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.4s v1, #10
+; CHECK-SD-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: testLowerToUHADD4s_c:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI83_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI83_0]
+; CHECK-GI-NEXT:    uaddw.2d v2, v1, v0
+; CHECK-GI-NEXT:    uaddw2.2d v0, v1, v0
+; CHECK-GI-NEXT:    shrn.2s v1, v2, #1
+; CHECK-GI-NEXT:    shrn2.4s v1, v0, #1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %add = add nuw nsw <4 x i64> %zextsrc1, <i64 10, i64 10, i64 10, i64 10>
   %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
@@ -1249,13 +1858,21 @@ define void @testLowerToUHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest
 }
 
 define <8 x i8> @andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
-; CHECK-LABEL: andmaskv8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8b v2, #7
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    uhadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: andmaskv8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8b v2, #7
+; CHECK-SD-NEXT:    xtn.8b v0, v0
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    uhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: andmaskv8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #7
+; CHECK-GI-NEXT:    and.16b v0, v0, v2
+; CHECK-GI-NEXT:    uaddw.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
   %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
@@ -1265,13 +1882,24 @@ define <8 x i8> @andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
 }
 
 define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
-; CHECK-LABEL: andmaskv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.16b v3, #7
-; CHECK-NEXT:    uzp1.16b v0, v0, v1
-; CHECK-NEXT:    and.16b v0, v0, v3
-; CHECK-NEXT:    uhadd.16b v0, v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: andmaskv16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.16b v3, #7
+; CHECK-SD-NEXT:    uzp1.16b v0, v0, v1
+; CHECK-SD-NEXT:    and.16b v0, v0, v3
+; CHECK-SD-NEXT:    uhadd.16b v0, v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: andmaskv16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v3, #7
+; CHECK-GI-NEXT:    and.16b v0, v0, v3
+; CHECK-GI-NEXT:    and.16b v1, v1, v3
+; CHECK-GI-NEXT:    uaddw.8h v0, v0, v2
+; CHECK-GI-NEXT:    uaddw2.8h v1, v1, v2
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    shrn2.16b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
   %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
@@ -1281,16 +1909,30 @@ define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
 }
 
 define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) {
-; CHECK-LABEL: andmask2v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1.16b v2, v2, v3
-; CHECK-NEXT:    movi.16b v3, #3
-; CHECK-NEXT:    uzp1.16b v0, v0, v1
-; CHECK-NEXT:    movi.16b v1, #7
-; CHECK-NEXT:    and.16b v2, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    uhadd.16b v0, v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: andmask2v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uzp1.16b v2, v2, v3
+; CHECK-SD-NEXT:    movi.16b v3, #3
+; CHECK-SD-NEXT:    uzp1.16b v0, v0, v1
+; CHECK-SD-NEXT:    movi.16b v1, #7
+; CHECK-SD-NEXT:    and.16b v2, v2, v3
+; CHECK-SD-NEXT:    and.16b v0, v0, v1
+; CHECK-SD-NEXT:    uhadd.16b v0, v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: andmask2v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v4, #7
+; CHECK-GI-NEXT:    movi.8h v5, #3
+; CHECK-GI-NEXT:    and.16b v0, v0, v4
+; CHECK-GI-NEXT:    and.16b v2, v2, v5
+; CHECK-GI-NEXT:    and.16b v1, v1, v4
+; CHECK-GI-NEXT:    and.16b v3, v3, v5
+; CHECK-GI-NEXT:    add.8h v0, v0, v2
+; CHECK-GI-NEXT:    add.8h v1, v1, v3
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    shrn2.16b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = and <16 x i16> %src2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
@@ -1300,14 +1942,24 @@ define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) {
 }
 
 define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: andmask2v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8b v2, #7
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    xtn.8b v1, v1
-; CHECK-NEXT:    and.8b v0, v0, v2
-; CHECK-NEXT:    uhadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: andmask2v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8b v2, #7
+; CHECK-SD-NEXT:    xtn.8b v0, v0
+; CHECK-SD-NEXT:    xtn.8b v1, v1
+; CHECK-SD-NEXT:    and.8b v0, v0, v2
+; CHECK-SD-NEXT:    uhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: andmask2v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #7
+; CHECK-GI-NEXT:    movi.2d v3, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.16b v0, v0, v2
+; CHECK-GI-NEXT:    and.16b v1, v1, v3
+; CHECK-GI-NEXT:    add.8h v0, v0, v1
+; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
   %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
@@ -1317,13 +1969,23 @@ define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) {
 }
 
 define <8 x i16> @andmask3v8i8(<8 x i16> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: andmask3v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8h v2, #7
-; CHECK-NEXT:    bic.8h v1, #254, lsl #8
-; CHECK-NEXT:    and.16b v0, v0, v2
-; CHECK-NEXT:    uhadd.8h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: andmask3v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.8h v2, #7
+; CHECK-SD-NEXT:    bic.8h v1, #254, lsl #8
+; CHECK-SD-NEXT:    and.16b v0, v0, v2
+; CHECK-SD-NEXT:    uhadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: andmask3v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi.8h v2, #7
+; CHECK-GI-NEXT:    mvni.8h v3, #254, lsl #8
+; CHECK-GI-NEXT:    and.16b v1, v1, v3
+; CHECK-GI-NEXT:    and.16b v0, v0, v2
+; CHECK-GI-NEXT:    add.8h v0, v0, v1
+; CHECK-GI-NEXT:    ushr.8h v0, v0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = and <8 x i16> %src2, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
   %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
@@ -1332,13 +1994,23 @@ define <8 x i16> @andmask3v8i8(<8 x i16> %src1, <8 x i16> %src2) {
 }
 
 define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
-; CHECK-LABEL: sextmaskv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr.8h v1, v1, #11
-; CHECK-NEXT:    sshr.8h v0, v0, #11
-; CHECK-NEXT:    uzp1.16b v0, v0, v1
-; CHECK-NEXT:    shadd.16b v0, v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sextmaskv16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshr.8h v1, v1, #11
+; CHECK-SD-NEXT:    sshr.8h v0, v0, #11
+; CHECK-SD-NEXT:    uzp1.16b v0, v0, v1
+; CHECK-SD-NEXT:    shadd.16b v0, v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sextmaskv16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.8h v3, v2, #0
+; CHECK-GI-NEXT:    sshr.8h v1, v1, #11
+; CHECK-GI-NEXT:    ssra.8h v3, v0, #11
+; CHECK-GI-NEXT:    saddw2.8h v1, v1, v2
+; CHECK-GI-NEXT:    shrn.8b v0, v3, #1
+; CHECK-GI-NEXT:    shrn2.16b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = ashr <16 x i16> %src1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
   %add = add nsw <16 x i16> %sextsrc1, %sextsrc2
@@ -1348,12 +2020,19 @@ define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
 }
 
 define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
-; CHECK-LABEL: sextmaskv8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr.8h v0, v0, #11
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    shadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sextmaskv8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshr.8h v0, v0, #11
+; CHECK-SD-NEXT:    xtn.8b v0, v0
+; CHECK-SD-NEXT:    shadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sextmaskv8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.8h v1, v1, #0
+; CHECK-GI-NEXT:    ssra.8h v1, v0, #11
+; CHECK-GI-NEXT:    shrn.8b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = ashr <8 x i16> %src1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
   %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
@@ -1363,11 +2042,18 @@ define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
 }
 
 define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) {
-; CHECK-LABEL: sextmask2v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shrn.8b v0, v0, #8
-; CHECK-NEXT:    shadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sextmask2v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shrn.8b v0, v0, #8
+; CHECK-SD-NEXT:    shadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sextmask2v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.8h v1, v1, #0
+; CHECK-GI-NEXT:    ssra.8h v1, v0, #8
+; CHECK-GI-NEXT:    shrn.8b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %sextsrc1 = ashr <8 x i16> %src1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
   %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
@@ -1377,13 +2063,20 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) {
 }
 
 define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) {
-; CHECK-LABEL: sextmask3v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr.8h v0, v0, #7
-; CHECK-NEXT:    sshll.8h v1, v1, #0
-; CHECK-NEXT:    shadd.8h v0, v0, v1
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sextmask3v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushr.8h v0, v0, #7
+; CHECK-SD-NEXT:    sshll.8h v1, v1, #0
+; CHECK-SD-NEXT:    shadd.8h v0, v0, v1
+; CHECK-SD-NEXT:    xtn.8b v0, v0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sextmask3v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.8h v1, v1, #0
+; CHECK-GI-NEXT:    ssra.8h v1, v0, #7
+; CHECK-GI-NEXT:    shrn.8b v0, v1, #1
+; CHECK-GI-NEXT:    ret
   %1 = ashr <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
   %add = add nsw <8 x i16> %1, %sextsrc2
@@ -1409,10 +2102,18 @@ define <4 x i16> @ext_via_i19(<4 x i16> %a) {
 }
 
 define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: srhadd_v8i8_trunc:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srhadd_v8i8_trunc:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srhadd_v8i8_trunc:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.8h v0, v0, #0
+; CHECK-GI-NEXT:    sshll.8h v1, v1, #0
+; CHECK-GI-NEXT:    urhadd.8h v0, v0, v1
+; CHECK-GI-NEXT:    xtn.8b v0, v0
+; CHECK-GI-NEXT:    ret
   %s0s = sext <8 x i8> %s0 to <8 x i16>
   %s1s = sext <8 x i8> %s1 to <8 x i16>
   %s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
@@ -1421,10 +2122,18 @@ define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
 }
 
 define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: srhadd_v4i16_trunc:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srhadd_v4i16_trunc:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srhadd_v4i16_trunc:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll.4s v0, v0, #0
+; CHECK-GI-NEXT:    sshll.4s v1, v1, #0
+; CHECK-GI-NEXT:    urhadd.4s v0, v0, v1
+; CHECK-GI-NEXT:    xtn.4h v0, v0
+; CHECK-GI-NEXT:    ret
   %s0s = sext <4 x i16> %s0 to <4 x i32>
   %s1s = sext <4 x i16> %s1 to <4 x i32>
   %s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
@@ -1451,10 +2160,18 @@ define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
 }
 
 define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: urhadd_v8i8_trunc:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urhadd_v8i8_trunc:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.8b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urhadd_v8i8_trunc:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll.8h v0, v0, #0
+; CHECK-GI-NEXT:    ushll.8h v1, v1, #0
+; CHECK-GI-NEXT:    srhadd.8h v0, v0, v1
+; CHECK-GI-NEXT:    xtn.8b v0, v0
+; CHECK-GI-NEXT:    ret
   %s0s = zext <8 x i8> %s0 to <8 x i16>
   %s1s = zext <8 x i8> %s1 to <8 x i16>
   %s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
@@ -1463,10 +2180,18 @@ define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
 }
 
 define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: urhadd_v4i16_trunc:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd.4h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urhadd_v4i16_trunc:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urhadd_v4i16_trunc:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
+; CHECK-GI-NEXT:    ushll.4s v1, v1, #0
+; CHECK-GI-NEXT:    srhadd.4s v0, v0, v1
+; CHECK-GI-NEXT:    xtn.4h v0, v0
+; CHECK-GI-NEXT:    ret
   %s0s = zext <4 x i16> %s0 to <4 x i32>
   %s1s = zext <4 x i16> %s1 to <4 x i32>
   %s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)



More information about the llvm-commits mailing list