[llvm] [GlobalISel][AArch64] Combine Vector Reduction Add Long (PR #76241)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 17 02:13:33 PST 2024
https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/76241
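For context, these pre-commit tests capture the pattern the combine targets: a long pairwise add (ADDLP) whose only use is an add reduction can be selected as a single across-lanes ADDLV. A minimal sketch of the IR shape, taken from the neon-addlv.ll tests in the patch below (uaddlv4h_from_v8i8):

; Widening pairwise add followed by a full add reduction.
declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
  %tmp1 = load <8 x i8>, ptr %A
  ; Pairwise-widen 8 x i8 lanes into 4 x i16 lanes...
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ; ...then sum the four lanes down to one scalar.
  %tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
  ret i16 %tmp5
}

As the CHECK lines added below record, SelectionDAG already selects this to a single "uaddlv h0, v0.8b", while GlobalISel currently emits the two-instruction "uaddlp v0.4h, v0.8b" + "addv h0, v0.4h" sequence that the combine is meant to fold.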
From c933054069fd1e669ab2188b1008078405c32537 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 19 Dec 2023 13:28:36 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Pre-Commit Tests for Combine
ADDLV(ADDLP)
---
llvm/test/CodeGen/AArch64/arm64-vadd.ll | 883 ++++++++++++++++++------
llvm/test/CodeGen/AArch64/neon-addlv.ll | 259 ++++---
2 files changed, 871 insertions(+), 271 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index ad089f38955be3b..38a568ac9191685 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for saddlp1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uaddlp1d
define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn8b:
-;CHECK: addhn.8b
+; CHECK-LABEL: addhn8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -10,8 +19,12 @@ define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn4h:
-;CHECK: addhn.4h
+; CHECK-LABEL: addhn4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -19,8 +32,12 @@ define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2s:
-;CHECK: addhn.2s
+; CHECK-LABEL: addhn2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d
+; CHECK-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -28,9 +45,12 @@ define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
}
define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
-;CHECK-LABEL: addhn2_16b:
-;CHECK: addhn.8b
-;CHECK-NEXT: addhn2.16b
+; CHECK-LABEL: addhn2_16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addhn v2.8b, v0.8h, v1.8h
+; CHECK-NEXT: addhn2 v2.16b, v0.8h, v1.8h
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -38,9 +58,12 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
}
define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
-;CHECK-LABEL: addhn2_8h:
-;CHECK: addhn.4h
-;CHECK-NEXT: addhn2.8h
+; CHECK-LABEL: addhn2_8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addhn v2.4h, v0.4s, v1.4s
+; CHECK-NEXT: addhn2 v2.8h, v0.4s, v1.4s
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -48,9 +71,12 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
}
define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
-;CHECK-LABEL: addhn2_4s:
-;CHECK: addhn.2s
-;CHECK-NEXT: addhn2.4s
+; CHECK-LABEL: addhn2_4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addhn v2.2s, v0.2d, v1.2d
+; CHECK-NEXT: addhn2 v2.4s, v0.2d, v1.2d
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -63,8 +89,12 @@ declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind re
define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn8b:
-;CHECK: raddhn.8b
+; CHECK-LABEL: raddhn8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -72,8 +102,12 @@ define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn4h:
-;CHECK: raddhn.4h
+; CHECK-LABEL: raddhn4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -81,8 +115,12 @@ define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: raddhn2s:
-;CHECK: raddhn.2s
+; CHECK-LABEL: raddhn2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d
+; CHECK-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -90,9 +128,12 @@ define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
}
define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
-;CHECK-LABEL: raddhn2_16b:
-;CHECK: raddhn.8b
-;CHECK-NEXT: raddhn2.16b
+; CHECK-LABEL: raddhn2_16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: raddhn v2.8b, v0.8h, v1.8h
+; CHECK-NEXT: raddhn2 v2.16b, v0.8h, v1.8h
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -100,9 +141,12 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
}
define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
-;CHECK-LABEL: raddhn2_8h:
-;CHECK: raddhn.4h
-;CHECK-NEXT: raddhn2.8h
+; CHECK-LABEL: raddhn2_8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: raddhn v2.4h, v0.4s, v1.4s
+; CHECK-NEXT: raddhn2 v2.8h, v0.4s, v1.4s
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -110,9 +154,12 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
}
define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
-;CHECK-LABEL: raddhn2_4s:
-;CHECK: raddhn.2s
-;CHECK-NEXT: raddhn2.4s
+; CHECK-LABEL: raddhn2_4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: raddhn v2.2s, v0.2d, v1.2d
+; CHECK-NEXT: raddhn2 v2.4s, v0.2d, v1.2d
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
%vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -124,8 +171,12 @@ declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl8h:
-;CHECK: saddl.8h
+; CHECK-LABEL: saddl8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -135,8 +186,12 @@ define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl4s:
-;CHECK: saddl.4s
+; CHECK-LABEL: saddl4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -146,8 +201,12 @@ define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddl2d:
-;CHECK: saddl.2d
+; CHECK-LABEL: saddl2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -158,8 +217,9 @@ define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: saddl2_8h:
-; CHECK-NEXT: saddl2.8h v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
@@ -174,8 +234,9 @@ define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: saddl2_4s:
-; CHECK-NEXT: saddl2.4s v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
@@ -190,8 +251,9 @@ define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: saddl2_2d:
-; CHECK-NEXT: saddl2.2d v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: saddl2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
@@ -205,8 +267,12 @@ define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
}
define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl8h:
-;CHECK: uaddl.8h
+; CHECK-LABEL: uaddl8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -216,8 +282,12 @@ define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl4s:
-;CHECK: uaddl.4s
+; CHECK-LABEL: uaddl4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -227,8 +297,12 @@ define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddl2d:
-;CHECK: uaddl.2d
+; CHECK-LABEL: uaddl2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -240,8 +314,9 @@ define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: uaddl2_8h:
-; CHECK-NEXT: uaddl2.8h v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp = bitcast <16 x i8> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
@@ -256,8 +331,9 @@ define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: uaddl2_4s:
-; CHECK-NEXT: uaddl2.4s v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
%tmp = bitcast <8 x i16> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
@@ -272,8 +348,9 @@ define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: uaddl2_2d:
-; CHECK-NEXT: uaddl2.2d v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp = bitcast <4 x i32> %a to <2 x i64>
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
@@ -287,8 +364,12 @@ define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
}
define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw8h:
-;CHECK: uaddw.8h
+; CHECK-LABEL: uaddw8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
@@ -297,8 +378,12 @@ define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw4s:
-;CHECK: uaddw.4s
+; CHECK-LABEL: uaddw4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
@@ -307,8 +392,12 @@ define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2d:
-;CHECK: uaddw.2d
+; CHECK-LABEL: uaddw2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
@@ -317,8 +406,19 @@ define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_8h:
-;CHECK: uaddw.8h
+; CHECK-SD-LABEL: uaddw2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddw2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <16 x i8>, ptr %B
@@ -330,8 +430,19 @@ define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_4s:
-;CHECK: uaddw.4s
+; CHECK-SD-LABEL: uaddw2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddw2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
@@ -343,8 +454,19 @@ define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uaddw2_2d:
-;CHECK: uaddw.2d
+; CHECK-SD-LABEL: uaddw2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddw2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: uaddw2 v0.2d, v0.2d, v1.4s
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
@@ -356,8 +478,12 @@ define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw8h:
-;CHECK: saddw.8h
+; CHECK-LABEL: saddw8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
@@ -366,8 +492,12 @@ define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw4s:
-;CHECK: saddw.4s
+; CHECK-LABEL: saddw4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
@@ -376,8 +506,12 @@ define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2d:
-;CHECK: saddw.2d
+; CHECK-LABEL: saddw2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
@@ -386,8 +520,19 @@ define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_8h:
-;CHECK: saddw.8h
+; CHECK-SD-LABEL: saddw2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddw2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: saddw2 v0.8h, v0.8h, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <16 x i8>, ptr %B
@@ -399,8 +544,19 @@ define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_4s:
-;CHECK: saddw.4s
+; CHECK-SD-LABEL: saddw2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: saddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddw2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
@@ -412,8 +568,19 @@ define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: saddw2_2d:
-;CHECK: saddw.2d
+; CHECK-SD-LABEL: saddw2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1, #8]
+; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v1.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddw2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: saddw2 v0.2d, v0.2d, v1.4s
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
@@ -425,48 +592,66 @@ define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @saddlp4h(ptr %A) nounwind {
-;CHECK-LABEL: saddlp4h:
-;CHECK: saddlp.4h
+; CHECK-LABEL: saddlp4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: saddlp v0.4h, v0.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp3
}
define <2 x i32> @saddlp2s(ptr %A) nounwind {
-;CHECK-LABEL: saddlp2s:
-;CHECK: saddlp.2s
+; CHECK-LABEL: saddlp2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: saddlp v0.2s, v0.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp3
}
define <1 x i64> @saddlp1d(ptr %A) nounwind {
-;CHECK-LABEL: saddlp1d:
-;CHECK: saddlp.1d
+; CHECK-LABEL: saddlp1d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: saddlp v0.1d, v0.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp3
}
define <8 x i16> @saddlp8h(ptr %A) nounwind {
-;CHECK-LABEL: saddlp8h:
-;CHECK: saddlp.8h
+; CHECK-LABEL: saddlp8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp3
}
define <4 x i32> @saddlp4s(ptr %A) nounwind {
-;CHECK-LABEL: saddlp4s:
-;CHECK: saddlp.4s
+; CHECK-LABEL: saddlp4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlp v0.4s, v0.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp3
}
define <2 x i64> @saddlp2d(ptr %A) nounwind {
-;CHECK-LABEL: saddlp2d:
-;CHECK: saddlp.2d
+; CHECK-LABEL: saddlp2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlp v0.2d, v0.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp3
@@ -481,48 +666,66 @@ declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind read
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
define <4 x i16> @uaddlp4h(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp4h:
-;CHECK: uaddlp.4h
+; CHECK-LABEL: uaddlp4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp3
}
define <2 x i32> @uaddlp2s(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp2s:
-;CHECK: uaddlp.2s
+; CHECK-LABEL: uaddlp2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp3
}
define <1 x i64> @uaddlp1d(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp1d:
-;CHECK: uaddlp.1d
+; CHECK-LABEL: uaddlp1d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: uaddlp v0.1d, v0.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp3
}
define <8 x i16> @uaddlp8h(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp8h:
-;CHECK: uaddlp.8h
+; CHECK-LABEL: uaddlp8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp3
}
define <4 x i32> @uaddlp4s(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp4s:
-;CHECK: uaddlp.4s
+; CHECK-LABEL: uaddlp4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp3
}
define <2 x i64> @uaddlp2d(ptr %A) nounwind {
-;CHECK-LABEL: uaddlp2d:
-;CHECK: uaddlp.2d
+; CHECK-LABEL: uaddlp2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp3
@@ -537,8 +740,12 @@ declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind read
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp4h:
-;CHECK: sadalp.4h
+; CHECK-LABEL: sadalp4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: sadalp v0.4h, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp4 = load <4 x i16>, ptr %B
@@ -547,8 +754,12 @@ define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp2s:
-;CHECK: sadalp.2s
+; CHECK-LABEL: sadalp2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: sadalp v0.2s, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp4 = load <2 x i32>, ptr %B
@@ -557,8 +768,12 @@ define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp8h:
-;CHECK: sadalp.8h
+; CHECK-LABEL: sadalp8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: sadalp v0.8h, v1.16b
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp4 = load <8 x i16>, ptr %B
@@ -567,8 +782,12 @@ define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp4s:
-;CHECK: sadalp.4s
+; CHECK-LABEL: sadalp4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: sadalp v0.4s, v1.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp4 = load <4 x i32>, ptr %B
@@ -577,8 +796,12 @@ define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: sadalp2d:
-;CHECK: sadalp.2d
+; CHECK-LABEL: sadalp2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: sadalp v0.2d, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp4 = load <2 x i64>, ptr %B
@@ -587,8 +810,12 @@ define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp4h:
-;CHECK: uadalp.4h
+; CHECK-LABEL: uadalp4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: uadalp v0.4h, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp4 = load <4 x i16>, ptr %B
@@ -597,8 +824,12 @@ define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp2s:
-;CHECK: uadalp.2s
+; CHECK-LABEL: uadalp2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: uadalp v0.2s, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp4 = load <2 x i32>, ptr %B
@@ -607,8 +838,12 @@ define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp8h:
-;CHECK: uadalp.8h
+; CHECK-LABEL: uadalp8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: uadalp v0.8h, v1.16b
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp4 = load <8 x i16>, ptr %B
@@ -617,8 +852,12 @@ define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp4s:
-;CHECK: uadalp.4s
+; CHECK-LABEL: uadalp4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: uadalp v0.4s, v1.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp4 = load <4 x i32>, ptr %B
@@ -627,8 +866,12 @@ define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: uadalp2d:
-;CHECK: uadalp.2d
+; CHECK-LABEL: uadalp2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: uadalp v0.2d, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp4 = load <2 x i64>, ptr %B
@@ -637,8 +880,12 @@ define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
}
define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_8b:
-;CHECK: addp.8b
+; CHECK-LABEL: addp_8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: addp v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -646,8 +893,12 @@ define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
}
define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_16b:
-;CHECK: addp.16b
+; CHECK-LABEL: addp_16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addp v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp2 = load <16 x i8>, ptr %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -655,8 +906,12 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_4h:
-;CHECK: addp.4h
+; CHECK-LABEL: addp_4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -664,8 +919,12 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_8h:
-;CHECK: addp.8h
+; CHECK-LABEL: addp_8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addp v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -673,8 +932,12 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_2s:
-;CHECK: addp.2s
+; CHECK-LABEL: addp_2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: addp v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -682,8 +945,12 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_4s:
-;CHECK: addp.4s
+; CHECK-LABEL: addp_4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addp v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -691,8 +958,12 @@ define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addp_2d:
-;CHECK: addp.2d
+; CHECK-LABEL: addp_2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -708,8 +979,12 @@ declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind r
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_2s:
-;CHECK: faddp.2s
+; CHECK-LABEL: faddp_2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: faddp v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
%tmp1 = load <2 x float>, ptr %A
%tmp2 = load <2 x float>, ptr %B
%tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
@@ -717,8 +992,12 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
}
define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_4s:
-;CHECK: faddp.4s
+; CHECK-LABEL: faddp_4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: faddp v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
%tmp1 = load <4 x float>, ptr %A
%tmp2 = load <4 x float>, ptr %B
%tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
@@ -726,8 +1005,12 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
}
define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: faddp_2d:
-;CHECK: faddp.2d
+; CHECK-LABEL: faddp_2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: faddp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
%tmp1 = load <2 x double>, ptr %A
%tmp2 = load <2 x double>, ptr %B
%tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
@@ -739,9 +1022,11 @@ declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nou
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: uaddl.2d
+; CHECK-LABEL: uaddl_duprhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2s, w0
+; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -755,9 +1040,18 @@ define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
}
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl2_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: uaddl2.2d
+; CHECK-SD-LABEL: uaddl2_duprhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.4s, w0
+; CHECK-SD-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddl2_duprhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v1.2s, w0
+; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -771,9 +1065,11 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
}
define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: saddl_duplhs
-; CHECK-NOT: ext.16b
-; CHECK: saddl.2d
+; CHECK-LABEL: saddl_duplhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2s, w0
+; CHECK-NEXT: saddl v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: ret
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
@@ -787,9 +1083,18 @@ define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
}
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: saddl2_duplhs
-; CHECK-NOT: ext.16b
-; CHECK: saddl2.2d
+; CHECK-SD-LABEL: saddl2_duplhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.4s, w0
+; CHECK-SD-NEXT: saddl2 v0.2d, v1.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddl2_duplhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v1.2s, w0
+; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT: ret
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
@@ -803,9 +1108,11 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
}
define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: usubl.2d
+; CHECK-LABEL: usubl_duprhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2s, w0
+; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -819,9 +1126,18 @@ define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
}
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl2_duprhs
-; CHECK-NOT: ext.16b
-; CHECK: usubl2.2d
+; CHECK-SD-LABEL: usubl2_duprhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.4s, w0
+; CHECK-SD-NEXT: usubl2 v0.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usubl2_duprhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v1.2s, w0
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -836,8 +1152,10 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl_duplhs:
-; CHECK-NOT: ext.16b
-; CHECK: ssubl.2d
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2s, w0
+; CHECK-NEXT: ssubl v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: ret
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
@@ -851,9 +1169,18 @@ define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
}
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs:
-; CHECK-NOT: ext.16b
-; CHECK: ssubl2.2d
+; CHECK-SD-LABEL: ssubl2_duplhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.4s, w0
+; CHECK-SD-NEXT: ssubl2 v0.2d, v1.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ssubl2_duplhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v1.2s, w0
+; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: ssubw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT: ret
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
@@ -867,8 +1194,20 @@ define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
}
define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn8b_natural:
-;CHECK: addhn.8b
+; CHECK-SD-LABEL: addhn8b_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn8b_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%sum = add <8 x i16> %tmp1, %tmp2
@@ -878,8 +1217,20 @@ define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn4h_natural:
-;CHECK: addhn.4h
+; CHECK-SD-LABEL: addhn4h_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn4h_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%sum = add <4 x i32> %tmp1, %tmp2
@@ -889,8 +1240,20 @@ define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2s_natural:
-;CHECK: addhn.2s
+; CHECK-SD-LABEL: addhn2s_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn2s_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%sum = add <2 x i64> %tmp1, %tmp2
@@ -900,8 +1263,22 @@ define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
}
define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_16b_natural:
-;CHECK: addhn2.16b
+; CHECK-SD-LABEL: addhn2_16b_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn2_16b_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%sum = add <8 x i16> %tmp1, %tmp2
@@ -912,8 +1289,22 @@ define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
}
define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_8h_natural:
-;CHECK: addhn2.8h
+; CHECK-SD-LABEL: addhn2_8h_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn2_8h_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%sum = add <4 x i32> %tmp1, %tmp2
@@ -924,8 +1315,22 @@ define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
}
define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: addhn2_4s_natural:
-;CHECK: addhn2.4s
+; CHECK-SD-LABEL: addhn2_4s_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn2_4s_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%sum = add <2 x i64> %tmp1, %tmp2
@@ -936,10 +1341,22 @@ define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
}
define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
-;CHECK-LABEL: addhn_addhn2_4s
-;CHECK: addhn.2s
-;CHECK: addhn2.4s
-;CHECK-NOT: uzp2.4s
+; CHECK-SD-LABEL: addhn_addhn2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: addhn v0.2s, v1.2d, v2.2d
+; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addhn_addhn2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: shrn v0.2s, v1.2d, #32
+; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%sum1 = add <2 x i64> %tmp1, %tmp2
@@ -955,8 +1372,20 @@ define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
}
define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn8b_natural:
-;CHECK: subhn.8b
+; CHECK-SD-LABEL: subhn8b_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn8b_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%diff = sub <8 x i16> %tmp1, %tmp2
@@ -966,8 +1395,20 @@ define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
}
define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn4h_natural:
-;CHECK: subhn.4h
+; CHECK-SD-LABEL: subhn4h_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn4h_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%diff = sub <4 x i32> %tmp1, %tmp2
@@ -977,8 +1418,20 @@ define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
}
define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2s_natural:
-;CHECK: subhn.2s
+; CHECK-SD-LABEL: subhn2s_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn2s_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%diff = sub <2 x i64> %tmp1, %tmp2
@@ -988,8 +1441,22 @@ define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
}
define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_16b_natural:
-;CHECK: subhn2.16b
+; CHECK-SD-LABEL: subhn2_16b_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn2_16b_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%diff = sub <8 x i16> %tmp1, %tmp2
@@ -1000,8 +1467,22 @@ define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
}
define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_8h_natural:
-;CHECK: subhn2.8h
+; CHECK-SD-LABEL: subhn2_8h_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn2_8h_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%diff = sub <4 x i32> %tmp1, %tmp2
@@ -1012,8 +1493,22 @@ define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
}
define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: subhn2_4s_natural:
-;CHECK: subhn2.4s
+; CHECK-SD-LABEL: subhn2_4s_natural:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: subhn2_4s_natural:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q2, [x1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%diff = sub <2 x i64> %tmp1, %tmp2
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 0241091fae02542..d3f703257e47519 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for uaddlv_v8i8_urshr
declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
@@ -20,12 +23,20 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) nounwind readnone
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) nounwind readnone
define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4h_from_v8i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv4h_from_v8i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uaddlv h0, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv4h_from_v8i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT: addv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -33,12 +44,20 @@ define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
}
define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv16b_from_v16i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv16b_from_v16i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uaddlv h0, v0.16b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv16b_from_v16i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT: addv h0, v0.8h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -46,12 +65,20 @@ define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
}
define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv8h_from_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uaddlv s0, v0.8h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv8h_from_v8i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv8h_from_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -59,12 +86,20 @@ define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
}
define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4s_from_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uaddlv d0, v0.4s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv4s_from_v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uaddlv d0, v0.4s
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv4s_from_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -72,12 +107,20 @@ define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
}
define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-LABEL: uaddlv4h_from_v4i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uaddlv s0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv4h_from_v4i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uaddlv s0, v0.4h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv4h_from_v4i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -87,12 +130,20 @@ define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4h_from_v8i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: saddlv h0, v0.8b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saddlv4h_from_v8i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: saddlv h0, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddlv4h_from_v8i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: saddlp v0.4h, v0.8b
+; CHECK-GI-NEXT: addv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -100,12 +151,20 @@ define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
}
define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-LABEL: saddlv16b_from_v16i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlv h0, v0.16b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saddlv16b_from_v16i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: saddlv h0, v0.16b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddlv16b_from_v16i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: saddlp v0.8h, v0.16b
+; CHECK-GI-NEXT: addv h0, v0.8h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -113,12 +172,20 @@ define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
}
define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-LABEL: saddlv8h_from_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlv s0, v0.8h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saddlv8h_from_v8i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: saddlv s0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddlv8h_from_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: saddlp v0.4s, v0.8h
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -126,12 +193,20 @@ define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
}
define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4s_from_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: saddlv d0, v0.4s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saddlv4s_from_v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: saddlv d0, v0.4s
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddlv4s_from_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: saddlp v0.2d, v0.4s
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -139,12 +214,20 @@ define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
}
define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-LABEL: saddlv4h_from_v4i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: saddlv s0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saddlv4h_from_v4i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: saddlv s0, v0.4h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saddlv4h_from_v4i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: saddlp v0.2s, v0.4h
+; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -154,11 +237,18 @@ define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) nounwind readnone
define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) {
-; CHECK-LABEL: uaddlv_known_bits_v8i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv_known_bits_v8i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaddlv h0, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv_known_bits_v8i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: and w0, w8, #0xffff
+; CHECK-GI-NEXT: ret
%tmp1 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
%tmp2 = and i32 %tmp1, 65535
ret i32 %tmp2
@@ -167,11 +257,18 @@ define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) {
declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
define i32 @uaddlv_known_bits_v16i8(<16 x i8> %a) {
-; CHECK-LABEL: uaddlv_known_bits_v16i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv_known_bits_v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: uaddlv h0, v0.16b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv_known_bits_v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: and w0, w8, #0xffff
+; CHECK-GI-NEXT: ret
entry:
%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
%0 = and i32 %vaddlv.i, 65535
@@ -179,12 +276,20 @@ entry:
}
define dso_local <8 x i8> @uaddlv_v8i8_dup(<8 x i8> %a) {
-; CHECK-LABEL: uaddlv_v8i8_dup:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: dup v0.8h, v0.h[0]
-; CHECK-NEXT: rshrn v0.8b, v0.8h, #3
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddlv_v8i8_dup:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: uaddlv h0, v0.8b
+; CHECK-SD-NEXT: dup v0.8h, v0.h[0]
+; CHECK-SD-NEXT: rshrn v0.8b, v0.8h, #3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddlv_v8i8_dup:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: dup v0.8h, w8
+; CHECK-GI-NEXT: rshrn v0.8b, v0.8h, #3
+; CHECK-GI-NEXT: ret
entry:
%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
%0 = trunc i32 %vaddlv.i to i16
>From 49dfa8047666805370385cad1f9bb66af108809d Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 19 Dec 2023 14:23:03 +0000
Subject: [PATCH 2/2] [AArch64][GlobalISel] Combine Vector Reduction Add Long
ADDLV(ADDLP) => ADDLV
Removes the now-unnecessary ADDLP instruction.
ADDV(ADDLP) => ADDLV
This combine already exists for SDAG; this patch adds it for GlobalISel.
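As a rough illustration (not part of the patch), the fold means that IR of
the shape exercised by the tests below selects to a single across-lanes
instruction instead of a pairwise add followed by a separate reduction.
The function name @example is made up:
  declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>)
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
  define i32 @example(<8 x i16> %v) {
    ; uaddlp: pairwise-add the eight i16 lanes into four i32 lanes
    %pair = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %v)
    ; reduce the four i32 lanes to a scalar sum
    %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %pair)
    ret i32 %sum
  }
With the combine this is expected to select to just:
  uaddlv s0, v0.8h
  fmov w0, s0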
---
llvm/lib/Target/AArch64/AArch64InstrGISel.td | 15 ++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 23 ++
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 52 +++++
.../AArch64/GlobalISel/legalize-ctpop.mir | 116 +++++-----
.../AArch64/GlobalISel/legalize-cttz.mir | 12 +-
.../test/CodeGen/AArch64/arm64-neon-across.ll | 32 ++-
llvm/test/CodeGen/AArch64/dp1.ll | 33 +--
llvm/test/CodeGen/AArch64/neon-addlv.ll | 200 ++++++------------
llvm/test/CodeGen/AArch64/popcount.ll | 26 +--
9 files changed, 268 insertions(+), 241 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index e53328d6553af3b..58ca52f37b63b7a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -227,6 +227,18 @@ def G_SMULL : AArch64GenericInstruction {
let hasSideEffects = 0;
}
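+// Pairwise long add: sums adjacent pairs of vector elements into a vector of
+// double-width elements (corresponds to the UADDLP/SADDLP instructions).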
+def G_UADDLP : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_SADDLP : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1);
+ let hasSideEffects = 0;
+}
+
def G_UADDLV : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1);
@@ -294,6 +306,9 @@ def : GINodeEquiv<G_BSP, AArch64bsp>;
def : GINodeEquiv<G_UMULL, AArch64umull>;
def : GINodeEquiv<G_SMULL, AArch64smull>;
+def : GINodeEquiv<G_SADDLP, AArch64saddlp_n>;
+def : GINodeEquiv<G_UADDLP, AArch64uaddlp_n>;
+
def : GINodeEquiv<G_SADDLV, AArch64saddlv>;
def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddbe840079a573b..74fcdd9d71ff632 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6664,6 +6664,26 @@ multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp>
defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>;
defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>;
+// These patterns are used by GlobalISel.
+multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator addlp> {
+ // Patterns for addv(addlp(x)) ==> addlv
+ def : Pat<(i16 (vecreduce_add (v4i16 (addlp (v8i8 V64:$Rn))))),
+ (!cast<Instruction>(Opc#"v8i8v") V64:$Rn)>;
+ def : Pat<(i16 (vecreduce_add (v8i16 (addlp (v16i8 V128:$Rn))))),
+ (!cast<Instruction>(Opc#"v16i8v") V128:$Rn)>;
+ def : Pat<(i32 (vecreduce_add (v4i32 (addlp (v8i16 V128:$Rn))))),
+ (!cast<Instruction>(Opc#"v8i16v") V128:$Rn)>;
+
+  // Patterns for addp(addlp(x)) ==> addlv
+ def : Pat<(i32 (vecreduce_add (v2i32 (addlp (v4i16 V64:$Rn))))),
+ (!cast<Instruction>(Opc#"v4i16v") V64:$Rn)>;
+ def : Pat<(i64 (vecreduce_add (v2i64 (addlp (v4i32 V128:$Rn))))),
+ (!cast<Instruction>(Opc#"v4i32v") V128:$Rn)>;
+}
+
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
+
// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
(i64 (EXTRACT_SUBREG
@@ -6675,6 +6695,9 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
(v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
ssub))>;
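+// GlobalISel legalizes a 64-bit uaddlv result to a v2i64 G_UADDLV, so also
+// match the vector-typed form of the uaddlv(uaddlp(x)) fold.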
+def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
+ (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
+
def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e94f9d0c68ffe78..2b6501784fb1979 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1445,6 +1445,58 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
+ case Intrinsic::aarch64_neon_uaddlp:
+ case Intrinsic::aarch64_neon_saddlp: {
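+    // Rewrite the pairwise add-long intrinsic as a target generic
+    // instruction (G_UADDLP/G_SADDLP) so the existing ADDLP patterns can
+    // select it.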
+ MachineIRBuilder MIB(MI);
+
+ unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
+ ? AArch64::G_UADDLP
+ : AArch64::G_SADDLP;
+ MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
+ MI.eraseFromParent();
+
+ return true;
+ }
+ case Intrinsic::aarch64_neon_uaddlv:
+ case Intrinsic::aarch64_neon_saddlv: {
+ MachineIRBuilder MIB(MI);
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
+ ? AArch64::G_UADDLV
+ : AArch64::G_SADDLV;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
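+    // ADDLV writes its scalar result into a vector register, so model the
+    // generic instruction with a vector result type and read the sum back
+    // out of element 0.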
+ LLT MidTy, ExtTy;
+ if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
+ MidTy = LLT::fixed_vector(4, 32);
+ ExtTy = LLT::scalar(32);
+ } else {
+ MidTy = LLT::fixed_vector(2, 64);
+ ExtTy = LLT::scalar(64);
+ }
+
+ Register MidReg =
+ MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
+ Register ZeroReg =
+ MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
+ Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
+ {MidReg, ZeroReg})
+ ->getOperand(0)
+ .getReg();
+
+ if (DstTy.getScalarSizeInBits() < 32)
+ MIB.buildTrunc(DstReg, ExtReg);
+ else
+ MIB.buildCopy(DstReg, ExtReg);
+
+ MI.eraseFromParent();
+
+ return true;
+ }
case Intrinsic::aarch64_neon_smax:
case Intrinsic::aarch64_neon_smin:
case Intrinsic::aarch64_neon_umax:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
index ae0c29927afa6d4..fe28c3a47ad5ed9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
@@ -69,7 +69,10 @@ body: |
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: %ctpop:_(s32) = COPY [[EVEC]](s32)
; CHECK-NEXT: $w0 = COPY %ctpop(s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
;
@@ -98,8 +101,11 @@ body: |
; CHECK-NEXT: %copy:_(s64) = COPY $x0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[INT]](s32)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK-NEXT: $x0 = COPY %ctpop(s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
;
@@ -131,12 +137,14 @@ body: |
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[MV]](s128)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[C1]](s32)
; CHECK-NEXT: $x0 = COPY [[MV1]](s64)
- ; CHECK-NEXT: $x1 = COPY [[C1]](s64)
+ ; CHECK-NEXT: $x1 = COPY [[C]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0, implicit $x1
;
; CHECK-CSSC-LABEL: name: s128_lower
@@ -177,9 +185,12 @@ body: |
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
- ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
;
; CHECK-CSSC-LABEL: name: widen_s16
@@ -216,9 +227,12 @@ body: |
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
- ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
;
; CHECK-CSSC-LABEL: name: widen_s8
@@ -255,9 +269,12 @@ body: |
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
- ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
;
; CHECK-CSSC-LABEL: name: widen_s3
@@ -293,9 +310,12 @@ body: |
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32)
- ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
;
; CHECK-CSSC-LABEL: name: different_sizes
@@ -329,8 +349,8 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-NEXT: $q0 = COPY [[INT]](<8 x s16>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
;
; CHECK-CSSC-LABEL: name: custom_8x16
@@ -339,8 +359,8 @@ body: |
; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-CSSC-NEXT: $q0 = COPY [[INT]](<8 x s16>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%1:_(<8 x s16>) = G_CTPOP %0(<8 x s16>)
@@ -361,9 +381,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
;
; CHECK-CSSC-LABEL: name: custom_4x32
@@ -372,9 +392,9 @@ body: |
; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = G_CTPOP %0(<4 x s32>)
@@ -395,10 +415,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>)
- ; CHECK-NEXT: $q0 = COPY [[INT2]](<2 x s64>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]]
+ ; CHECK-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
;
; CHECK-CSSC-LABEL: name: custom_2x64
@@ -407,10 +427,10 @@ body: |
; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-CSSC-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>)
- ; CHECK-CSSC-NEXT: $q0 = COPY [[INT2]](<2 x s64>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-CSSC-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]]
+ ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = G_CTPOP %0(<2 x s64>)
@@ -431,8 +451,8 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: $d0 = COPY [[INT]](<4 x s16>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
;
; CHECK-CSSC-LABEL: name: custom_4x16
@@ -441,8 +461,8 @@ body: |
; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
- ; CHECK-CSSC-NEXT: $d0 = COPY [[INT]](<4 x s16>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0
%0:_(<4 x s16>) = COPY $d0
%1:_(<4 x s16>) = G_CTPOP %0(<4 x s16>)
@@ -463,9 +483,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
- ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>)
- ; CHECK-NEXT: $d0 = COPY [[INT1]](<2 x s32>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
;
; CHECK-CSSC-LABEL: name: custom_2x32
@@ -474,9 +494,9 @@ body: |
; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>)
- ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>)
- ; CHECK-CSSC-NEXT: $d0 = COPY [[INT1]](<2 x s32>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0
%0:_(<2 x s32>) = COPY $d0
%1:_(<2 x s32>) = G_CTPOP %0(<2 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
index 535a8d811e43a70..8b39ebd986dd748 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir
@@ -147,9 +147,9 @@ body: |
; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>)
; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+ ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
;
; CHECK-CSSC-LABEL: name: v4s32
@@ -163,9 +163,9 @@ body: |
; CHECK-CSSC-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]]
; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>)
; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>)
- ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>)
- ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>)
+ ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]]
+ ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]]
+ ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>)
; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0
%val:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = G_CTTZ %val(<4 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
index 218f4147787d1de..f7ff64228ecd347 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -81,11 +81,17 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_vaddlv_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlv h0, v0.8b
-; CHECK-NEXT: smov w0, v0.h[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vaddlv_s8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: saddlv h0, v0.8b
+; CHECK-SD-NEXT: smov w0, v0.h[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vaddlv_s8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: saddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
%0 = trunc i32 %saddlvv.i to i16
@@ -127,11 +133,17 @@ entry:
}
define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_vaddlvq_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlv h0, v0.16b
-; CHECK-NEXT: smov w0, v0.h[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vaddlvq_s8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: saddlv h0, v0.16b
+; CHECK-SD-NEXT: smov w0, v0.h[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vaddlvq_s8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: saddlv h0, v0.16b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
%0 = trunc i32 %saddlvv.i to i16
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
index bb5b19e51995a4a..949dad7798a6ca5 100644
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -197,27 +197,16 @@ define void @cttz_zeroundef_i64() {
}
define void @ctpop_i32() {
-; CHECK-SDAG-LABEL: ctpop_i32:
-; CHECK-SDAG: // %bb.0:
-; CHECK-SDAG-NEXT: adrp x8, :got:var32
-; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32]
-; CHECK-SDAG-NEXT: ldr w9, [x8]
-; CHECK-SDAG-NEXT: fmov d0, x9
-; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
-; CHECK-SDAG-NEXT: uaddlv h0, v0.8b
-; CHECK-SDAG-NEXT: str s0, [x8]
-; CHECK-SDAG-NEXT: ret
-;
-; CHECK-GISEL-LABEL: ctpop_i32:
-; CHECK-GISEL: // %bb.0:
-; CHECK-GISEL-NEXT: adrp x8, :got:var32
-; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32]
-; CHECK-GISEL-NEXT: ldr w9, [x8]
-; CHECK-GISEL-NEXT: fmov d0, x9
-; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
-; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
-; CHECK-GISEL-NEXT: str s0, [x8]
-; CHECK-GISEL-NEXT: ret
+; CHECK-LABEL: ctpop_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, :got:var32
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: uaddlv h0, v0.8b
+; CHECK-NEXT: str s0, [x8]
+; CHECK-NEXT: ret
%val0_tmp = load i32, ptr @var32
%val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp)
store volatile i32 %val4_tmp, ptr @var32
@@ -244,7 +233,7 @@ define void @ctpop_i64() {
; CHECK-GISEL-NEXT: fmov d0, x9
; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
-; CHECK-GISEL-NEXT: fmov w9, s0
+; CHECK-GISEL-NEXT: mov w9, v0.s[0]
; CHECK-GISEL-NEXT: str x9, [x8]
; CHECK-GISEL-NEXT: ret
%val0_tmp = load i64, ptr @var64
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index d3f703257e47519..50f555b18ff07be 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -23,20 +23,12 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) nounwind readnone
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) nounwind readnone
define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4h_from_v8i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: uaddlv h0, v0.8b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uaddlv4h_from_v8i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d0, [x0]
-; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-GI-NEXT: addv h0, v0.4h
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uaddlv4h_from_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: uaddlv h0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -44,20 +36,12 @@ define i16 @uaddlv4h_from_v8i8(ptr %A) nounwind {
}
define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv16b_from_v16i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: uaddlv h0, v0.16b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uaddlv16b_from_v16i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-GI-NEXT: addv h0, v0.8h
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uaddlv16b_from_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -65,20 +49,12 @@ define i16 @uaddlv16b_from_v16i8(ptr %A) nounwind {
}
define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv8h_from_v8i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: uaddlv s0, v0.8h
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uaddlv8h_from_v8i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uaddlv8h_from_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlv s0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -86,20 +62,12 @@ define i32 @uaddlv8h_from_v8i16(ptr %A) nounwind {
}
define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4s_from_v4i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: uaddlv d0, v0.4s
-; CHECK-SD-NEXT: fmov x0, d0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uaddlv4s_from_v4i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-GI-NEXT: addp d0, v0.2d
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uaddlv4s_from_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: uaddlv d0, v0.4s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -107,20 +75,12 @@ define i64 @uaddlv4s_from_v4i32(ptr %A) nounwind {
}
define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: uaddlv4h_from_v4i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: uaddlv s0, v0.4h
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uaddlv4h_from_v4i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d0, [x0]
-; CHECK-GI-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uaddlv4h_from_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: uaddlv s0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
@@ -130,20 +90,12 @@ define i32 @uaddlv4h_from_v4i16(ptr %A) nounwind {
define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4h_from_v8i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: saddlv h0, v0.8b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: saddlv4h_from_v8i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d0, [x0]
-; CHECK-GI-NEXT: saddlp v0.4h, v0.8b
-; CHECK-GI-NEXT: addv h0, v0.4h
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: saddlv4h_from_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: saddlv h0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %tmp3)
@@ -151,20 +103,12 @@ define i16 @saddlv4h_from_v8i8(ptr %A) nounwind {
}
define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv16b_from_v16i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: saddlv h0, v0.16b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: saddlv16b_from_v16i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: saddlp v0.8h, v0.16b
-; CHECK-GI-NEXT: addv h0, v0.8h
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: saddlv16b_from_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlv h0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, ptr %A
%tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tmp3)
@@ -172,20 +116,12 @@ define i16 @saddlv16b_from_v16i8(ptr %A) nounwind {
}
define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv8h_from_v8i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: saddlv s0, v0.8h
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: saddlv8h_from_v8i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: saddlp v0.4s, v0.8h
-; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: saddlv8h_from_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlv s0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp3)
@@ -193,20 +129,12 @@ define i32 @saddlv8h_from_v8i16(ptr %A) nounwind {
}
define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4s_from_v4i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: saddlv d0, v0.4s
-; CHECK-SD-NEXT: fmov x0, d0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: saddlv4s_from_v4i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q0, [x0]
-; CHECK-GI-NEXT: saddlp v0.2d, v0.4s
-; CHECK-GI-NEXT: addp d0, v0.2d
-; CHECK-GI-NEXT: fmov x0, d0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: saddlv4s_from_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: saddlv d0, v0.4s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp5 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %tmp3)
@@ -214,20 +142,12 @@ define i64 @saddlv4s_from_v4i32(ptr %A) nounwind {
}
define i32 @saddlv4h_from_v4i16(ptr %A) nounwind {
-; CHECK-SD-LABEL: saddlv4h_from_v4i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: saddlv s0, v0.4h
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: saddlv4h_from_v4i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d0, [x0]
-; CHECK-GI-NEXT: saddlp v0.2s, v0.4h
-; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: saddlv4h_from_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: saddlv s0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %tmp3)
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 0a3ee98f843c809..b1231eeac1ea431 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -7,9 +7,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
; CHECK: // %bb.0: // %Entry
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlv h1, v0.16b
-; CHECK-NEXT: // implicit-def: $q0
-; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: // kill: def $q0 killed $h0
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -38,19 +37,17 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
; CHECK-NEXT: mov v0.d[0], x9
; CHECK-NEXT: mov v0.d[1], x8
; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: uaddlv h2, v1.16b
-; CHECK-NEXT: // implicit-def: $q1
-; CHECK-NEXT: fmov s1, s2
+; CHECK-NEXT: uaddlv h1, v1.16b
+; CHECK-NEXT: // kill: def $q1 killed $h1
; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1
-; CHECK-NEXT: mov w10, wzr
; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: mov w10, wzr
; CHECK-NEXT: mov w9, w0
; CHECK-NEXT: mov w8, w10
; CHECK-NEXT: bfi x9, x8, #32, #32
; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlv h1, v0.16b
-; CHECK-NEXT: // implicit-def: $q0
-; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: // kill: def $q0 killed $h0
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: mov w8, w0
@@ -76,16 +73,15 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
; CHECK-NEXT: mov v0.d[0], x0
; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlv h1, v0.16b
-; CHECK-NEXT: // implicit-def: $q0
-; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: // kill: def $q0 killed $h0
+; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: // kill: def $x0 killed $w0
; CHECK-NEXT: // kill: def $x8 killed $w8
; CHECK-NEXT: bfi x0, x8, #32, #32
-; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: ret
Entry:
%1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)