[llvm] [GlobalISel][AArch64] Combine Vector Reduction Add Long (PR #76241)

via llvm-commits <llvm-commits@lists.llvm.org>
Mon Jan 22 01:42:44 PST 2024


https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/76241

From 450f50d519440b888a8e58ed6e0d5a2ada7dc361 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh@arm.com>
Date: Tue, 19 Dec 2023 13:28:36 +0000
Subject: [PATCH 1/3] [AArch64][GlobalISel] Pre-Commit Tests for Combine
 ADDLV(ADDLP)

---
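For reviewers: the combine this series prepares for folds a pairwise
widening add (ADDLP) that feeds a full vector reduction into a single
across-lanes widening reduction (ADDLV). A minimal sketch of the IR
pattern involved, taken from the updated neon-addlv.ll test below:

  define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
    %tmp1 = load <8 x i8>, ptr %A
    %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
    %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
    ret i16 %tmp5
  }

SelectionDAG already selects this to a single "uaddlv h0, v0.8b"; these
pre-commit tests record the current GlobalISel output (uaddlp followed by
a separate reduction) so the follow-up combine's effect is visible in the
test diffs.
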
 llvm/test/CodeGen/AArch64/arm64-vadd.ll | 883 ++++++++++++++++++------
 llvm/test/CodeGen/AArch64/neon-addlv.ll | 259 ++++---
 2 files changed, 871 insertions(+), 271 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index ad089f38955be3..38a568ac919168 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:         warning: Instruction selection used fallback path for saddlp1d
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for uaddlp1d
 
 define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn8b:
-;CHECK: addhn.8b
+; CHECK-LABEL: addhn8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addhn v0.8b, v0.8h, v1.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -10,8 +19,12 @@ define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn4h:
-;CHECK: addhn.4h
+; CHECK-LABEL: addhn4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addhn v0.4h, v0.4s, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -19,8 +32,12 @@ define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2s:
-;CHECK: addhn.2s
+; CHECK-LABEL: addhn2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addhn v0.2s, v0.2d, v1.2d
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -28,9 +45,12 @@ define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
-;CHECK-LABEL: addhn2_16b:
-;CHECK: addhn.8b
-;CHECK-NEXT: addhn2.16b
+; CHECK-LABEL: addhn2_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addhn v2.8b, v0.8h, v1.8h
+; CHECK-NEXT:    addhn2 v2.16b, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -38,9 +58,12 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
 }
 
 define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
-;CHECK-LABEL: addhn2_8h:
-;CHECK: addhn.4h
-;CHECK-NEXT: addhn2.8h
+; CHECK-LABEL: addhn2_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addhn v2.4h, v0.4s, v1.4s
+; CHECK-NEXT:    addhn2 v2.8h, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -48,9 +71,12 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
 }
 
 define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
-;CHECK-LABEL: addhn2_4s:
-;CHECK: addhn.2s
-;CHECK-NEXT: addhn2.4s
+; CHECK-LABEL: addhn2_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addhn v2.2s, v0.2d, v1.2d
+; CHECK-NEXT:    addhn2 v2.4s, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -63,8 +89,12 @@ declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind re
 
 
 define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn8b:
-;CHECK: raddhn.8b
+; CHECK-LABEL: raddhn8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    raddhn v0.8b, v0.8h, v1.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -72,8 +102,12 @@ define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn4h:
-;CHECK: raddhn.4h
+; CHECK-LABEL: raddhn4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    raddhn v0.4h, v0.4s, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -81,8 +115,12 @@ define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn2s:
-;CHECK: raddhn.2s
+; CHECK-LABEL: raddhn2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    raddhn v0.2s, v0.2d, v1.2d
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -90,9 +128,12 @@ define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
-;CHECK-LABEL: raddhn2_16b:
-;CHECK: raddhn.8b
-;CHECK-NEXT: raddhn2.16b
+; CHECK-LABEL: raddhn2_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    raddhn v2.8b, v0.8h, v1.8h
+; CHECK-NEXT:    raddhn2 v2.16b, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -100,9 +141,12 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
 }
 
 define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
-;CHECK-LABEL: raddhn2_8h:
-;CHECK: raddhn.4h
-;CHECK-NEXT: raddhn2.8h
+; CHECK-LABEL: raddhn2_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    raddhn v2.4h, v0.4s, v1.4s
+; CHECK-NEXT:    raddhn2 v2.8h, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -110,9 +154,12 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
 }
 
 define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
-;CHECK-LABEL: raddhn2_4s:
-;CHECK: raddhn.2s
-;CHECK-NEXT: raddhn2.4s
+; CHECK-LABEL: raddhn2_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    raddhn v2.2s, v0.2d, v1.2d
+; CHECK-NEXT:    raddhn2 v2.4s, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
   %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -124,8 +171,12 @@ declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind
 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl8h:
-;CHECK: saddl.8h
+; CHECK-LABEL: saddl8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -135,8 +186,12 @@ define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl4s:
-;CHECK: saddl.4s
+; CHECK-LABEL: saddl4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -146,8 +201,12 @@ define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl2d:
-;CHECK: saddl.2d
+; CHECK-LABEL: saddl2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i32>, ptr %A
         %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -158,8 +217,9 @@ define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
 
 define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
 ; CHECK-LABEL: saddl2_8h:
-; CHECK-NEXT: saddl2.8h v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp = bitcast <16 x i8> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
@@ -174,8 +234,9 @@ define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
 
 define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
 ; CHECK-LABEL: saddl2_4s:
-; CHECK-NEXT: saddl2.4s v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    saddl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %tmp = bitcast <8 x i16> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
@@ -190,8 +251,9 @@ define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
 
 define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
 ; CHECK-LABEL: saddl2_2d:
-; CHECK-NEXT: saddl2.2d v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    saddl2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %tmp = bitcast <4 x i32> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
@@ -205,8 +267,12 @@ define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
 }
 
 define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl8h:
-;CHECK: uaddl.8h
+; CHECK-LABEL: uaddl8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -216,8 +282,12 @@ define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl4s:
-;CHECK: uaddl.4s
+; CHECK-LABEL: uaddl4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -227,8 +297,12 @@ define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl2d:
-;CHECK: uaddl.2d
+; CHECK-LABEL: uaddl2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -240,8 +314,9 @@ define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
 
 define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
 ; CHECK-LABEL: uaddl2_8h:
-; CHECK-NEXT: uaddl2.8h v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp = bitcast <16 x i8> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
@@ -256,8 +331,9 @@ define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
 
 define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
 ; CHECK-LABEL: uaddl2_4s:
-; CHECK-NEXT: uaddl2.4s v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT:    ret
   %tmp = bitcast <8 x i16> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
@@ -272,8 +348,9 @@ define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
 
 define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
 ; CHECK-LABEL: uaddl2_2d:
-; CHECK-NEXT: uaddl2.2d v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT:    ret
   %tmp = bitcast <4 x i32> %a to <2 x i64>
   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
@@ -287,8 +364,12 @@ define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
 }
 
 define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw8h:
-;CHECK: uaddw.8h
+; CHECK-LABEL: uaddw8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
@@ -297,8 +378,12 @@ define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw4s:
-;CHECK: uaddw.4s
+; CHECK-LABEL: uaddw4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
@@ -307,8 +392,12 @@ define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2d:
-;CHECK: uaddw.2d
+; CHECK-LABEL: uaddw2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
@@ -317,8 +406,19 @@ define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_8h:
-;CHECK: uaddw.8h
+; CHECK-SD-LABEL: uaddw2_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    uaddw v0.8h, v0.8h, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddw2_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    uaddw2 v0.8h, v0.8h, v1.16b
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
 
         %tmp2 = load <16 x i8>, ptr %B
@@ -330,8 +430,19 @@ define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_4s:
-;CHECK: uaddw.4s
+; CHECK-SD-LABEL: uaddw2_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddw2_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
 
         %tmp2 = load <8 x i16>, ptr %B
@@ -343,8 +454,19 @@ define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_2d:
-;CHECK: uaddw.2d
+; CHECK-SD-LABEL: uaddw2_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddw2_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
 
         %tmp2 = load <4 x i32>, ptr %B
@@ -356,8 +478,12 @@ define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw8h:
-;CHECK: saddw.8h
+; CHECK-LABEL: saddw8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddw v0.8h, v0.8h, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i8>, ptr %B
         %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
@@ -366,8 +492,12 @@ define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw4s:
-;CHECK: saddw.4s
+; CHECK-LABEL: saddw4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i16>, ptr %B
         %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
@@ -376,8 +506,12 @@ define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2d:
-;CHECK: saddw.2d
+; CHECK-LABEL: saddw2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    saddw v0.2d, v0.2d, v1.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i32>, ptr %B
         %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
@@ -386,8 +520,19 @@ define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_8h:
-;CHECK: saddw.8h
+; CHECK-SD-LABEL: saddw2_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    saddw v0.8h, v0.8h, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddw2_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    saddw2 v0.8h, v0.8h, v1.16b
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
 
         %tmp2 = load <16 x i8>, ptr %B
@@ -399,8 +544,19 @@ define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_4s:
-;CHECK: saddw.4s
+; CHECK-SD-LABEL: saddw2_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddw2_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
 
         %tmp2 = load <8 x i16>, ptr %B
@@ -412,8 +568,19 @@ define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_2d:
-;CHECK: saddw.2d
+; CHECK-SD-LABEL: saddw2_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1, #8]
+; CHECK-SD-NEXT:    saddw v0.2d, v0.2d, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddw2_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    saddw2 v0.2d, v0.2d, v1.4s
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
 
         %tmp2 = load <4 x i32>, ptr %B
@@ -425,48 +592,66 @@ define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @saddlp4h(ptr %A) nounwind {
-;CHECK-LABEL: saddlp4h:
-;CHECK: saddlp.4h
+; CHECK-LABEL: saddlp4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    saddlp v0.4h, v0.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
         ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @saddlp2s(ptr %A) nounwind {
-;CHECK-LABEL: saddlp2s:
-;CHECK: saddlp.2s
+; CHECK-LABEL: saddlp2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    saddlp v0.2s, v0.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
         ret <2 x i32> %tmp3
 }
 
 define <1 x i64> @saddlp1d(ptr %A) nounwind {
-;CHECK-LABEL: saddlp1d:
-;CHECK: saddlp.1d
+; CHECK-LABEL: saddlp1d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    saddlp v0.1d, v0.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i32>, ptr %A
         %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
         ret <1 x i64> %tmp3
 }
 
 define <8 x i16> @saddlp8h(ptr %A) nounwind {
-;CHECK-LABEL: saddlp8h:
-;CHECK: saddlp.8h
+; CHECK-LABEL: saddlp8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlp v0.8h, v0.16b
+; CHECK-NEXT:    ret
         %tmp1 = load <16 x i8>, ptr %A
         %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
         ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @saddlp4s(ptr %A) nounwind {
-;CHECK-LABEL: saddlp4s:
-;CHECK: saddlp.4s
+; CHECK-LABEL: saddlp4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlp v0.4s, v0.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
         ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @saddlp2d(ptr %A) nounwind {
-;CHECK-LABEL: saddlp2d:
-;CHECK: saddlp.2d
+; CHECK-LABEL: saddlp2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlp v0.2d, v0.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
         ret <2 x i64> %tmp3
@@ -481,48 +666,66 @@ declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind read
 declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
 
 define <4 x i16> @uaddlp4h(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp4h:
-;CHECK: uaddlp.4h
+; CHECK-LABEL: uaddlp4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
         ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @uaddlp2s(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp2s:
-;CHECK: uaddlp.2s
+; CHECK-LABEL: uaddlp2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
         ret <2 x i32> %tmp3
 }
 
 define <1 x i64> @uaddlp1d(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp1d:
-;CHECK: uaddlp.1d
+; CHECK-LABEL: uaddlp1d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uaddlp v0.1d, v0.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i32>, ptr %A
         %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
         ret <1 x i64> %tmp3
 }
 
 define <8 x i16> @uaddlp8h(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp8h:
-;CHECK: uaddlp.8h
+; CHECK-LABEL: uaddlp8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    ret
         %tmp1 = load <16 x i8>, ptr %A
         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
         ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @uaddlp4s(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp4s:
-;CHECK: uaddlp.4s
+; CHECK-LABEL: uaddlp4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
         ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @uaddlp2d(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp2d:
-;CHECK: uaddlp.2d
+; CHECK-LABEL: uaddlp2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
         ret <2 x i64> %tmp3
@@ -537,8 +740,12 @@ declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind read
 declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
 
 define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp4h:
-;CHECK: sadalp.4h
+; CHECK-LABEL: sadalp4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    sadalp v0.4h, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
         %tmp4 = load <4 x i16>, ptr %B
@@ -547,8 +754,12 @@ define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp2s:
-;CHECK: sadalp.2s
+; CHECK-LABEL: sadalp2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    sadalp v0.2s, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
         %tmp4 = load <2 x i32>, ptr %B
@@ -557,8 +768,12 @@ define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp8h:
-;CHECK: sadalp.8h
+; CHECK-LABEL: sadalp8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    sadalp v0.8h, v1.16b
+; CHECK-NEXT:    ret
         %tmp1 = load <16 x i8>, ptr %A
         %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
         %tmp4 = load <8 x i16>, ptr %B
@@ -567,8 +782,12 @@ define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp4s:
-;CHECK: sadalp.4s
+; CHECK-LABEL: sadalp4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    sadalp v0.4s, v1.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
         %tmp4 = load <4 x i32>, ptr %B
@@ -577,8 +796,12 @@ define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp2d:
-;CHECK: sadalp.2d
+; CHECK-LABEL: sadalp2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    sadalp v0.2d, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
         %tmp4 = load <2 x i64>, ptr %B
@@ -587,8 +810,12 @@ define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp4h:
-;CHECK: uadalp.4h
+; CHECK-LABEL: uadalp4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    uadalp v0.4h, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
         %tmp4 = load <4 x i16>, ptr %B
@@ -597,8 +824,12 @@ define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp2s:
-;CHECK: uadalp.2s
+; CHECK-LABEL: uadalp2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    uadalp v0.2s, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
         %tmp4 = load <2 x i32>, ptr %B
@@ -607,8 +838,12 @@ define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp8h:
-;CHECK: uadalp.8h
+; CHECK-LABEL: uadalp8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    uadalp v0.8h, v1.16b
+; CHECK-NEXT:    ret
         %tmp1 = load <16 x i8>, ptr %A
         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
         %tmp4 = load <8 x i16>, ptr %B
@@ -617,8 +852,12 @@ define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp4s:
-;CHECK: uadalp.4s
+; CHECK-LABEL: uadalp4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
         %tmp4 = load <4 x i32>, ptr %B
@@ -627,8 +866,12 @@ define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp2d:
-;CHECK: uadalp.2d
+; CHECK-LABEL: uadalp2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    uadalp v0.2d, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
         %tmp4 = load <2 x i64>, ptr %B
@@ -637,8 +880,12 @@ define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_8b:
-;CHECK: addp.8b
+; CHECK-LABEL: addp_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    addp v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i8>, ptr %A
         %tmp2 = load <8 x i8>, ptr %B
         %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -646,8 +893,12 @@ define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_16b:
-;CHECK: addp.16b
+; CHECK-LABEL: addp_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addp v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
         %tmp1 = load <16 x i8>, ptr %A
         %tmp2 = load <16 x i8>, ptr %B
         %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -655,8 +906,12 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_4h:
-;CHECK: addp.4h
+; CHECK-LABEL: addp_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    addp v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i16>, ptr %A
         %tmp2 = load <4 x i16>, ptr %B
         %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -664,8 +919,12 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_8h:
-;CHECK: addp.8h
+; CHECK-LABEL: addp_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addp v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -673,8 +932,12 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_2s:
-;CHECK: addp.2s
+; CHECK-LABEL: addp_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    addp v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i32>, ptr %A
         %tmp2 = load <2 x i32>, ptr %B
         %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -682,8 +945,12 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_4s:
-;CHECK: addp.4s
+; CHECK-LABEL: addp_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addp v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -691,8 +958,12 @@ define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_2d:
-;CHECK: addp.2d
+; CHECK-LABEL: addp_2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -708,8 +979,12 @@ declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind r
 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_2s:
-;CHECK: faddp.2s
+; CHECK-LABEL: faddp_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    faddp v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x float>, ptr %A
         %tmp2 = load <2 x float>, ptr %B
         %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
@@ -717,8 +992,12 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_4s:
-;CHECK: faddp.4s
+; CHECK-LABEL: faddp_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    faddp v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
         %tmp1 = load <4 x float>, ptr %A
         %tmp2 = load <4 x float>, ptr %B
         %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
@@ -726,8 +1005,12 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_2d:
-;CHECK: faddp.2d
+; CHECK-LABEL: faddp_2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    faddp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
         %tmp1 = load <2 x double>, ptr %A
         %tmp2 = load <2 x double>, ptr %B
         %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
@@ -739,9 +1022,11 @@ declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nou
 declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: uaddl.2d
+; CHECK-LABEL: uaddl_duprhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -755,9 +1040,18 @@ define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
 }
 
 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl2_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: uaddl2.2d
+; CHECK-SD-LABEL: uaddl2_duprhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddl2_duprhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.2s, w0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -771,9 +1065,11 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 }
 
 define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: saddl_duplhs
-; CHECK-NOT: ext.16b
-; CHECK: saddl.2d
+; CHECK-LABEL: saddl_duplhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    saddl v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 
@@ -787,9 +1083,18 @@ define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
 }
 
 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: saddl2_duplhs
-; CHECK-NOT: ext.16b
-; CHECK: saddl2.2d
+; CHECK-SD-LABEL: saddl2_duplhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    saddl2 v0.2d, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddl2_duplhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.2s, w0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    ret
   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 
@@ -803,9 +1108,11 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
 }
 
 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: usubl.2d
+; CHECK-LABEL: usubl_duprhs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    usubl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -819,9 +1126,18 @@ define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
 }
 
 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl2_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: usubl2.2d
+; CHECK-SD-LABEL: usubl2_duprhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    usubl2 v0.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usubl2_duprhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.2s, w0
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    usubl v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -836,8 +1152,10 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 
 define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: ssubl_duplhs:
-; CHECK-NOT: ext.16b
-; CHECK: ssubl.2d
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    ssubl v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 
@@ -851,9 +1169,18 @@ define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
 }
 
 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs:
-; CHECK-NOT: ext.16b
-; CHECK: ssubl2.2d
+; CHECK-SD-LABEL: ssubl2_duplhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    ssubl2 v0.2d, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ssubl2_duplhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.2s, w0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ssubw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    ret
   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 
@@ -867,8 +1194,20 @@ define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
 }
 
 define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn8b_natural:
-;CHECK: addhn.8b
+; CHECK-SD-LABEL: addhn8b_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    addhn v0.8b, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn8b_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %sum = add <8 x i16> %tmp1, %tmp2
@@ -878,8 +1217,20 @@ define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn4h_natural:
-;CHECK: addhn.4h
+; CHECK-SD-LABEL: addhn4h_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    addhn v0.4h, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn4h_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %sum = add <4 x i32> %tmp1, %tmp2
@@ -889,8 +1240,20 @@ define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2s_natural:
-;CHECK: addhn.2s
+; CHECK-SD-LABEL: addhn2s_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    addhn v0.2s, v0.2d, v1.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn2s_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %sum = add <2 x i64> %tmp1, %tmp2
@@ -900,8 +1263,22 @@ define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_16b_natural:
-;CHECK: addhn2.16b
+; CHECK-SD-LABEL: addhn2_16b_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    addhn2 v0.16b, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn2_16b_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    add v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    shrn2 v0.16b, v1.8h, #8
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %sum = add <8 x i16> %tmp1, %tmp2
@@ -912,8 +1289,22 @@ define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_8h_natural:
-;CHECK: addhn2.8h
+; CHECK-SD-LABEL: addhn2_8h_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    addhn2 v0.8h, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn2_8h_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    shrn2 v0.8h, v1.4s, #16
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %sum = add <4 x i32> %tmp1, %tmp2
@@ -924,8 +1315,22 @@ define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_4s_natural:
-;CHECK: addhn2.4s
+; CHECK-SD-LABEL: addhn2_4s_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    addhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn2_4s_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %sum = add <2 x i64> %tmp1, %tmp2
@@ -936,10 +1341,22 @@ define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
-;CHECK-LABEL: addhn_addhn2_4s
-;CHECK:     addhn.2s
-;CHECK:     addhn2.4s
-;CHECK-NOT: uzp2.4s
+; CHECK-SD-LABEL: addhn_addhn2_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    addhn v0.2s, v1.2d, v2.2d
+; CHECK-SD-NEXT:    addhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: addhn_addhn2_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    shrn v0.2s, v1.2d, #32
+; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT:    ret
             %tmp1 = load <2 x i64>, ptr %A
             %tmp2 = load <2 x i64>, ptr %B
             %sum1 = add <2 x i64> %tmp1, %tmp2
@@ -955,8 +1372,20 @@ define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
 }
 
 define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn8b_natural:
-;CHECK: subhn.8b
+; CHECK-SD-LABEL: subhn8b_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    subhn v0.8b, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn8b_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    sub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %diff = sub <8 x i16> %tmp1, %tmp2
@@ -966,8 +1395,20 @@ define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn4h_natural:
-;CHECK: subhn.4h
+; CHECK-SD-LABEL: subhn4h_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    subhn v0.4h, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn4h_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %diff = sub <4 x i32> %tmp1, %tmp2
@@ -977,8 +1418,20 @@ define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2s_natural:
-;CHECK: subhn.2s
+; CHECK-SD-LABEL: subhn2s_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    subhn v0.2s, v0.2d, v1.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn2s_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %diff = sub <2 x i64> %tmp1, %tmp2
@@ -988,8 +1441,22 @@ define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_16b_natural:
-;CHECK: subhn2.16b
+; CHECK-SD-LABEL: subhn2_16b_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    subhn2 v0.16b, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn2_16b_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    sub v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    shrn2 v0.16b, v1.8h, #8
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <8 x i16>, ptr %A
         %tmp2 = load <8 x i16>, ptr %B
         %diff = sub <8 x i16> %tmp1, %tmp2
@@ -1000,8 +1467,22 @@ define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_8h_natural:
-;CHECK: subhn2.8h
+; CHECK-SD-LABEL: subhn2_8h_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    subhn2 v0.8h, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn2_8h_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    shrn2 v0.8h, v1.4s, #16
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <4 x i32>, ptr %A
         %tmp2 = load <4 x i32>, ptr %B
         %diff = sub <4 x i32> %tmp1, %tmp2
@@ -1012,8 +1493,22 @@ define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_4s_natural:
-;CHECK: subhn2.4s
+; CHECK-SD-LABEL: subhn2_4s_natural:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    subhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: subhn2_4s_natural:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT:    ret
         %tmp1 = load <2 x i64>, ptr %A
         %tmp2 = load <2 x i64>, ptr %B
         %diff = sub <2 x i64> %tmp1, %tmp2
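
The neon-addlv.ll changes below follow the same scheme: the shared CHECK
lines are split into CHECK-SD and CHECK-GI, and the CHECK-GI lines record
where GlobalISel currently lowers the uaddlp-plus-reduce pattern as two
instructions rather than one across-lanes reduction. A representative
pair, copied from the uaddlv4h_from_v8i8 test that follows:

  ; CHECK-SD:    uaddlv h0, v0.8b
  ;
  ; CHECK-GI:    uaddlp v0.4h, v0.8b
  ; CHECK-GI:    addv h0, v0.4h
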
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 0241091fae0254..d3f703257e4751 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:         warning: Instruction selection used fallback path for uaddlv_v8i8_urshr
 
 declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
 declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
@@ -20,12 +23,20 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) nounwind readnone
 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) nounwind readnone
 
 define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4h_from_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv4h_from_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    uaddlv h0, v0.8b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv4h_from_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    addv h0, v0.4h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -33,12 +44,20 @@ define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
 }
 
 define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv16b_from_v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uaddlv h0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv16b_from_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uaddlv h0, v0.16b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv16b_from_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -46,12 +65,20 @@ define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
 }
 
 define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv8h_from_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv8h_from_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv8h_from_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -59,12 +86,20 @@ define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
 }
 
 define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4s_from_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv4s_from_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv4s_from_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
   %tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -72,12 +107,20 @@ define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
 }
 
 define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4h_from_v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uaddlv s0, v0.4h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv4h_from_v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    uaddlv s0, v0.4h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv4h_from_v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-GI-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -87,12 +130,20 @@ define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
 
 
 define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4h_from_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    saddlv h0, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: saddlv4h_from_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    saddlv h0, v0.8b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddlv4h_from_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    saddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    addv h0, v0.4h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -100,12 +151,20 @@ define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
 }
 
 define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-LABEL: saddlv16b_from_v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    saddlv h0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: saddlv16b_from_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    saddlv h0, v0.16b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddlv16b_from_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    saddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -113,12 +172,20 @@ define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
 }
 
 define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-LABEL: saddlv8h_from_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: saddlv8h_from_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddlv8h_from_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    saddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -126,12 +193,20 @@ define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
 }
 
 define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4s_from_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: saddlv4s_from_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddlv4s_from_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    saddlp v0.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
   %tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -139,12 +214,20 @@ define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
 }
 
 define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4h_from_v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    saddlv s0, v0.4h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: saddlv4h_from_v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    saddlv s0, v0.4h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: saddlv4h_from_v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    saddlp v0.2s, v0.4h
+; CHECK-GI-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -154,11 +237,18 @@ define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) nounwind readnone
 
 define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) {
-; CHECK-LABEL: uaddlv_known_bits_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv_known_bits_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uaddlv h0, v0.8b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv_known_bits_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
   %tmp1 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
   %tmp2 = and i32 %tmp1, 65535
   ret i32 %tmp2
@@ -167,11 +257,18 @@ define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) {
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
 
 define i32 @uaddlv_known_bits_v16i8(<16 x i8> %a) {
-; CHECK-LABEL: uaddlv_known_bits_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv h0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv_known_bits_v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv h0, v0.16b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv_known_bits_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
   %0 = and i32 %vaddlv.i, 65535
@@ -179,12 +276,20 @@ entry:
 }
 
 define dso_local <8 x i8> @uaddlv_v8i8_dup(<8 x i8> %a) {
-; CHECK-LABEL: uaddlv_v8i8_dup:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    dup v0.8h, v0.h[0]
-; CHECK-NEXT:    rshrn v0.8b, v0.8h, #3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddlv_v8i8_dup:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv h0, v0.8b
+; CHECK-SD-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-SD-NEXT:    rshrn v0.8b, v0.8h, #3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddlv_v8i8_dup:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    dup v0.8h, w8
+; CHECK-GI-NEXT:    rshrn v0.8b, v0.8h, #3
+; CHECK-GI-NEXT:    ret
 entry:
   %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
   %0 = trunc i32 %vaddlv.i to i16

>From 508dfdf81107e90c3781d3cc1a47321bfb65390f Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 19 Dec 2023 14:23:03 +0000
Subject: [PATCH 2/3] [AArch64][GlobalISel] Combine Vector Reduction Add Long

ADDLV(ADDLP) => ADDLV
Removes the now-unnecessary ADDLP instruction.

ADDV(ADDLP) => ADDLV
This combine already exists for SDAG; add it for GlobalISel.
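
For example (mirroring saddlv4h_from_v8i8 in neon-addlv.ll), IR of the
form

  %pairs = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %v)
  %sum = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %pairs)

previously selected to a saddlp followed by an addv under GlobalISel and
now selects to a single saddlv, matching the SelectionDAG output.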
---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  15 ++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  23 ++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  52 +++++
 .../AArch64/GlobalISel/legalize-ctpop.mir     | 116 +++++-----
 .../AArch64/GlobalISel/legalize-cttz.mir      |  12 +-
 .../test/CodeGen/AArch64/arm64-neon-across.ll |  32 ++-
 llvm/test/CodeGen/AArch64/dp1.ll              |  33 +--
 llvm/test/CodeGen/AArch64/neon-addlv.ll       | 200 ++++++------------
 llvm/test/CodeGen/AArch64/popcount.ll         |  26 +--
 9 files changed, 268 insertions(+), 241 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index e53328d6553af3..58ca52f37b63b7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -227,6 +227,18 @@ def G_SMULL : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_UADDLP : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_SADDLP : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
 def G_UADDLV : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1);
@@ -294,6 +306,9 @@ def : GINodeEquiv<G_BSP, AArch64bsp>;
 def : GINodeEquiv<G_UMULL, AArch64umull>;
 def : GINodeEquiv<G_SMULL, AArch64smull>;
 
+def : GINodeEquiv<G_SADDLP, AArch64saddlp_n>;
+def : GINodeEquiv<G_UADDLP, AArch64uaddlp_n>;
+
 def : GINodeEquiv<G_SADDLV, AArch64saddlv>;
 def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6a744689b79ec8..c63f23bda6805a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6664,6 +6664,26 @@ multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp>
 defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>;
 defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>;
 
+// Patterns used for GlobalISel
+multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator addlp> {
+  // Patterns for addv(addlp(x)) ==> addlv
+  def : Pat<(i16 (vecreduce_add (v4i16 (addlp (v8i8 V64:$Rn))))),
+            (!cast<Instruction>(Opc#"v8i8v") V64:$Rn)>;
+  def : Pat<(i16 (vecreduce_add (v8i16 (addlp (v16i8 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v16i8v") V128:$Rn)>;
+  def : Pat<(i32 (vecreduce_add (v4i32 (addlp (v8i16 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v8i16v") V128:$Rn)>;
+
+  // Patterns for addp(addlp(x)) ==> addlv
+  def : Pat<(i32 (vecreduce_add (v2i32 (addlp (v4i16 V64:$Rn))))),
+            (!cast<Instruction>(Opc#"v4i16v") V64:$Rn)>;
+  def : Pat<(i64 (vecreduce_add (v2i64 (addlp (v4i32 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v4i32v") V128:$Rn)>;
+}
+
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
+
 // Patterns for uaddlv(uaddlp(x)) ==> uaddlv
 def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
           (i64 (EXTRACT_SUBREG
@@ -6675,6 +6695,9 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))>;
 
+def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
+          (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b561cb12c93a1c..13ffbb13d8ae30 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1451,6 +1451,58 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
     return true;
   }
+  case Intrinsic::aarch64_neon_uaddlp:
+  case Intrinsic::aarch64_neon_saddlp: {
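+    // Lower the pairwise-add intrinsics to the target-specific generic
+    // instructions G_UADDLP/G_SADDLP so that instruction selection can
+    // match them via the GINodeEquiv entries in AArch64InstrGISel.td.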
+    MachineIRBuilder MIB(MI);
+
+    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
+                       ? AArch64::G_UADDLP
+                       : AArch64::G_SADDLP;
+    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
+    MI.eraseFromParent();
+
+    return true;
+  }
+  case Intrinsic::aarch64_neon_uaddlv:
+  case Intrinsic::aarch64_neon_saddlv: {
+    MachineIRBuilder MIB(MI);
+    MachineRegisterInfo &MRI = *MIB.getMRI();
+
+    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
+                       ? AArch64::G_UADDLV
+                       : AArch64::G_SADDLV;
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(2).getReg();
+    LLT DstTy = MRI.getType(DstReg);
+
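+    // ADDLV leaves its scalar result in lane 0 of a vector register, so
+    // build G_UADDLV/G_SADDLV with a wide vector type, extract element 0,
+    // and truncate or copy it to the destination scalar type.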
+    LLT MidTy, ExtTy;
+    if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
+      MidTy = LLT::fixed_vector(4, 32);
+      ExtTy = LLT::scalar(32);
+    } else {
+      MidTy = LLT::fixed_vector(2, 64);
+      ExtTy = LLT::scalar(64);
+    }
+
+    Register MidReg =
+        MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
+    Register ZeroReg =
+        MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
+    Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
+                                     {MidReg, ZeroReg})
+                          ->getOperand(0)
+                          .getReg();
+
+    if (DstTy.getScalarSizeInBits() < 32)
+      MIB.buildTrunc(DstReg, ExtReg);
+    else
+      MIB.buildCopy(DstReg, ExtReg);
+
+    MI.eraseFromParent();
+
+    return true;
+  }
   case Intrinsic::aarch64_neon_smax:
   case Intrinsic::aarch64_neon_smin:
   case Intrinsic::aarch64_neon_umax:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
index ae0c29927afa6d..fe28c3a47ad5ed 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
@@ -69,7 +69,10 @@ body:             |
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32)
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+    ; CHECK-NEXT: %ctpop:_(s32) = COPY [[EVEC]](s32)
     ; CHECK-NEXT: $w0 = COPY %ctpop(s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     ;
@@ -98,8 +101,11 @@ body:             |
     ; CHECK-NEXT: %copy:_(s64) = COPY $x0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[INT]](s32)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[COPY]](s32)
     ; CHECK-NEXT: $x0 = COPY %ctpop(s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
     ;
@@ -131,12 +137,14 @@ body:             |
     ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64)
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[MV]](s128)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[C1]](s32)
     ; CHECK-NEXT: $x0 = COPY [[MV1]](s64)
-    ; CHECK-NEXT: $x1 = COPY [[C1]](s64)
+    ; CHECK-NEXT: $x1 = COPY [[C]](s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0, implicit $x1
     ;
     ; CHECK-CSSC-LABEL: name: s128_lower
@@ -177,9 +185,12 @@ body:             |
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
-    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     ;
     ; CHECK-CSSC-LABEL: name: widen_s16
@@ -216,9 +227,12 @@ body:             |
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
-    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     ;
     ; CHECK-CSSC-LABEL: name: widen_s8
@@ -255,9 +269,12 @@ body:             |
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
-    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     ;
     ; CHECK-CSSC-LABEL: name: widen_s3
@@ -293,9 +310,12 @@ body:             |
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
-    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     ;
     ; CHECK-CSSC-LABEL: name: different_sizes
@@ -329,8 +349,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-NEXT: $q0 = COPY [[INT]](<8 x s16>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     ;
     ; CHECK-CSSC-LABEL: name: custom_8x16
@@ -339,8 +359,8 @@ body:             |
     ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: $q0 = COPY [[INT]](<8 x s16>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
     %0:_(<8 x s16>) = COPY $q0
     %1:_(<8 x s16>) = G_CTPOP %0(<8 x s16>)
@@ -361,9 +381,9 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     ;
     ; CHECK-CSSC-LABEL: name: custom_4x32
@@ -372,9 +392,9 @@ body:             |
     ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
     %0:_(<4 x s32>) = COPY $q0
     %1:_(<4 x s32>) = G_CTPOP %0(<4 x s32>)
@@ -395,10 +415,10 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>)
-    ; CHECK-NEXT: $q0 = COPY [[INT2]](<2 x s64>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]]
+    ; CHECK-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     ;
     ; CHECK-CSSC-LABEL: name: custom_2x64
@@ -407,10 +427,10 @@ body:             |
     ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-CSSC-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>)
-    ; CHECK-CSSC-NEXT: $q0 = COPY [[INT2]](<2 x s64>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-CSSC-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]]
+    ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
     %0:_(<2 x s64>) = COPY $q0
     %1:_(<2 x s64>) = G_CTPOP %0(<2 x s64>)
@@ -431,8 +451,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: $d0 = COPY [[INT]](<4 x s16>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     ;
     ; CHECK-CSSC-LABEL: name: custom_4x16
@@ -441,8 +461,8 @@ body:             |
     ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
-    ; CHECK-CSSC-NEXT: $d0 = COPY [[INT]](<4 x s16>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0
     %0:_(<4 x s16>) = COPY $d0
     %1:_(<4 x s16>) = G_CTPOP %0(<4 x s16>)
@@ -463,9 +483,9 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
-    ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>)
-    ; CHECK-NEXT: $d0 = COPY [[INT1]](<2 x s32>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     ;
     ; CHECK-CSSC-LABEL: name: custom_2x32
@@ -474,9 +494,9 @@ body:             |
     ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>)
-    ; CHECK-CSSC-NEXT: $d0 = COPY [[INT1]](<2 x s32>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0
     %0:_(<2 x s32>) = COPY $d0
     %1:_(<2 x s32>) = G_CTPOP %0(<2 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
index 535a8d811e43a7..8b39ebd986dd74 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
@@ -147,9 +147,9 @@ body:             |
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]]
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>)
     ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+    ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     ;
     ; CHECK-CSSC-LABEL: name: v4s32
@@ -163,9 +163,9 @@ body:             |
     ; CHECK-CSSC-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]]
     ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>)
     ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
-    ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
-    ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+    ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+    ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+    ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
     ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
     %val:_(<4 x s32>) = COPY $q0
     %1:_(<4 x s32>) = G_CTTZ %val(<4 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
index 218f4147787d1d..f7ff64228ecd34 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -81,11 +81,17 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
 declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
 
 define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_vaddlv_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv h0, v0.8b
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vaddlv_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv h0, v0.8b
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vaddlv_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
   %0 = trunc i32 %saddlvv.i to i16
@@ -127,11 +133,17 @@ entry:
 }
 
 define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_vaddlvq_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv h0, v0.16b
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vaddlvq_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv h0, v0.16b
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vaddlvq_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
   %0 = trunc i32 %saddlvv.i to i16
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
index bb5b19e51995a4..949dad7798a6ca 100644
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -197,27 +197,16 @@ define void @cttz_zeroundef_i64() {
 }
 
 define void @ctpop_i32() {
-; CHECK-SDAG-LABEL: ctpop_i32:
-; CHECK-SDAG:       // %bb.0:
-; CHECK-SDAG-NEXT:    adrp x8, :got:var32
-; CHECK-SDAG-NEXT:    ldr x8, [x8, :got_lo12:var32]
-; CHECK-SDAG-NEXT:    ldr w9, [x8]
-; CHECK-SDAG-NEXT:    fmov d0, x9
-; CHECK-SDAG-NEXT:    cnt v0.8b, v0.8b
-; CHECK-SDAG-NEXT:    uaddlv h0, v0.8b
-; CHECK-SDAG-NEXT:    str s0, [x8]
-; CHECK-SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: ctpop_i32:
-; CHECK-GISEL:       // %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, :got:var32
-; CHECK-GISEL-NEXT:    ldr x8, [x8, :got_lo12:var32]
-; CHECK-GISEL-NEXT:    ldr w9, [x8]
-; CHECK-GISEL-NEXT:    fmov d0, x9
-; CHECK-GISEL-NEXT:    cnt v0.8b, v0.8b
-; CHECK-GISEL-NEXT:    uaddlv h0, v0.8b
-; CHECK-GISEL-NEXT:    str s0, [x8]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: ctpop_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, :got:var32
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:var32]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    str s0, [x8]
+; CHECK-NEXT:    ret
   %val0_tmp = load i32, ptr @var32
   %val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp)
   store volatile i32 %val4_tmp, ptr @var32
@@ -244,7 +233,7 @@ define void @ctpop_i64() {
 ; CHECK-GISEL-NEXT:    fmov d0, x9
 ; CHECK-GISEL-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GISEL-NEXT:    uaddlv h0, v0.8b
-; CHECK-GISEL-NEXT:    fmov w9, s0
+; CHECK-GISEL-NEXT:    mov w9, v0.s[0]
 ; CHECK-GISEL-NEXT:    str x9, [x8]
 ; CHECK-GISEL-NEXT:    ret
   %val0_tmp = load i64, ptr @var64
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index d3f703257e4751..50f555b18ff07b 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -23,20 +23,12 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) nounwind readnone
 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) nounwind readnone
 
 define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4h_from_v8i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    uaddlv h0, v0.8b
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaddlv4h_from_v8i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-GI-NEXT:    addv h0, v0.4h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaddlv4h_from_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -44,20 +36,12 @@ define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
 }
 
 define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv16b_from_v16i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    uaddlv h0, v0.16b
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaddlv16b_from_v16i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaddlv16b_from_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -65,20 +49,12 @@ define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
 }
 
 define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv8h_from_v8i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaddlv8h_from_v8i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaddlv8h_from_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -86,20 +62,12 @@ define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
 }
 
 define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4s_from_v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaddlv4s_from_v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    uaddlp v0.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaddlv4s_from_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uaddlv d0, v0.4s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
   %tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -107,20 +75,12 @@ define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
 }
 
 define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4h_from_v4i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    uaddlv s0, v0.4h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaddlv4h_from_v4i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    uaddlp v0.2s, v0.4h
-; CHECK-GI-NEXT:    addp v0.2s, v0.2s, v0.2s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaddlv4h_from_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -130,20 +90,12 @@ define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
 
 
 define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4h_from_v8i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    saddlv h0, v0.8b
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saddlv4h_from_v8i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    saddlp v0.4h, v0.8b
-; CHECK-GI-NEXT:    addv h0, v0.4h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saddlv4h_from_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    saddlv h0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -151,20 +103,12 @@ define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
 }
 
 define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv16b_from_v16i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    saddlv h0, v0.16b
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saddlv16b_from_v16i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saddlv16b_from_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
   %tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -172,20 +116,12 @@ define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
 }
 
 define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv8h_from_v8i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saddlv8h_from_v8i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    saddlp v0.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saddlv8h_from_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -193,20 +129,12 @@ define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
 }
 
 define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4s_from_v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saddlv4s_from_v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q0, [x0]
-; CHECK-GI-NEXT:    saddlp v0.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saddlv4s_from_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    saddlv d0, v0.4s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
   %tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -214,20 +142,12 @@ define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
 }
 
 define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4h_from_v4i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    saddlv s0, v0.4h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saddlv4h_from_v4i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    saddlp v0.2s, v0.4h
-; CHECK-GI-NEXT:    addp v0.2s, v0.2s, v0.2s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saddlv4h_from_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
   %tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 0a3ee98f843c80..b1231eeac1ea43 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -7,9 +7,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; CHECK:       // %bb.0: // %Entry
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlv h1, v0.16b
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // kill: def $q0 killed $h0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -38,19 +37,17 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    mov v0.d[0], x9
 ; CHECK-NEXT:    mov v0.d[1], x8
 ; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    uaddlv h2, v1.16b
-; CHECK-NEXT:    // implicit-def: $q1
-; CHECK-NEXT:    fmov s1, s2
+; CHECK-NEXT:    uaddlv h1, v1.16b
+; CHECK-NEXT:    // kill: def $q1 killed $h1
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 killed $q1
-; CHECK-NEXT:    mov w10, wzr
 ; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    mov w10, wzr
 ; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    mov w8, w10
 ; CHECK-NEXT:    bfi x9, x8, #32, #32
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlv h1, v0.16b
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // kill: def $q0 killed $h0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    mov w8, w0
@@ -76,16 +73,15 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-NEXT:    mov v0.d[0], x0
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlv h1, v0.16b
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // kill: def $q0 killed $h0
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    // kill: def $x0 killed $w0
 ; CHECK-NEXT:    // kill: def $x8 killed $w8
 ; CHECK-NEXT:    bfi x0, x8, #32, #32
-; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    ret
 Entry:
   %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)

>From 08c2f403ee9929ba4364c9d37667080cf4d39791 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Fri, 19 Jan 2024 17:56:41 +0000
Subject: [PATCH 3/3] fixup! [AArch64][GlobalISel] Combine Vector Reduction Add
 Long
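
Use MachineInstrBuilder::getReg(0) to fetch the result register directly
instead of chaining through getOperand(0).getReg().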

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 13ffbb13d8ae30..091b70e3c1f15e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1491,8 +1491,7 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
                                      {MidReg, ZeroReg})
-                          ->getOperand(0)
-                          .getReg();
+                          .getReg(0);
 
     if (DstTy.getScalarSizeInBits() < 32)
       MIB.buildTrunc(DstReg, ExtReg);


