[llvm] [GlobalIsel] Push cast through build vector (PR #104634)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 16 12:26:13 PDT 2024
https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/104634
>From 0f93b8e2173645043466ffc08f98699f8d239a1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 16 Aug 2024 20:57:01 +0200
Subject: [PATCH 1/2] [GlobalIsel] Push cast through build vector
Credits: https://github.com/llvm/llvm-project/pull/100563
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +
.../include/llvm/Target/GlobalISel/Combine.td | 17 +-
.../GlobalISel/CombinerHelperCasts.cpp | 39 +
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 1 +
.../AArch64/GlobalISel/combine-cast.mir | 92 +++
.../GlobalISel/combine-extract-vec-elt.mir | 4 +-
.../AArch64/GlobalISel/combine-with-flags.mir | 45 +-
.../CodeGen/AArch64/arm64-subvector-extend.ll | 456 +++++++-----
llvm/test/CodeGen/AArch64/arm64-vadd.ll | 46 +-
llvm/test/CodeGen/AArch64/neon-extadd.ll | 376 ++++++----
llvm/test/CodeGen/AArch64/sext.ll | 354 ++++++----
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 664 +++++++++++-------
llvm/test/CodeGen/AArch64/xtn.ll | 46 +-
llvm/test/CodeGen/AArch64/zext.ll | 263 ++++---
...mbine-shl-from-extend-narrow.postlegal.mir | 12 +-
...ombine-shl-from-extend-narrow.prelegal.mir | 10 +-
16 files changed, 1491 insertions(+), 937 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 32effc536eb35d..9b62d6067be39c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -900,6 +900,9 @@ class CombinerHelper {
bool matchExtOfExt(const MachineInstr &FirstMI, const MachineInstr &SecondMI,
BuildFnTy &MatchInfo);
+ bool matchCastOfBuildVector(const MachineInstr &CastMI,
+ const MachineInstr &BVMI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 814c5e789cb374..c95f542757c66b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1857,6 +1857,18 @@ def anyext_of_anyext : ext_of_ext_opcodes<G_ANYEXT, G_ANYEXT>;
def anyext_of_zext : ext_of_ext_opcodes<G_ANYEXT, G_ZEXT>;
def anyext_of_sext : ext_of_ext_opcodes<G_ANYEXT, G_SEXT>;
+// Push cast through build vector.
+class buildvector_of_opcode<Instruction castOpcode> : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_BUILD_VECTOR $bv, GIVariadic<>:$unused):$Build,
+ (castOpcode $root, $bv):$Cast,
+ [{ return Helper.matchCastOfBuildVector(*${Cast}, *${Build}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>;
+
+def buildvector_of_zext : buildvector_of_opcode<G_ZEXT>;
+def buildvector_of_anyext : buildvector_of_opcode<G_ANYEXT>;
+def buildvector_of_truncate : buildvector_of_opcode<G_TRUNC>;
+
def cast_combines: GICombineGroup<[
truncate_of_zext,
truncate_of_sext,
@@ -1870,7 +1882,10 @@ def cast_combines: GICombineGroup<[
sext_of_anyext,
anyext_of_anyext,
anyext_of_zext,
- anyext_of_sext
+ anyext_of_sext,
+ buildvector_of_zext,
+ buildvector_of_anyext,
+ buildvector_of_truncate
]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 494d8da84445d1..e3208c16cfd586 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -273,3 +273,42 @@ bool CombinerHelper::matchExtOfExt(const MachineInstr &FirstMI,
return false;
}
+
+bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI,
+ const MachineInstr &BVMI,
+ BuildFnTy &MatchInfo) {
+ const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+ const GBuildVector *BV = cast<GBuildVector>(&BVMI);
+
+ if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
+ return false;
+
+ Register Dst = Cast->getReg(0);
+ // The type of the new build vector.
+ LLT DstTy = MRI.getType(Dst);
+ // The scalar or element type of the new build vector.
+ LLT ElemTy = DstTy.getScalarType();
+ // The scalar or element type of the old build vector.
+ LLT InputElemTy = MRI.getType(BV->getReg(0)).getScalarType();
+
+ // Check legality of new build vector, the scalar casts, and profitability of
+ // the many casts.
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_BUILD_VECTOR, {DstTy, ElemTy}}) ||
+ !isLegalOrBeforeLegalizer({Cast->getOpcode(), {ElemTy, InputElemTy}}) ||
+ !isCastFree(Cast->getOpcode(), ElemTy, InputElemTy))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ SmallVector<Register> Casts;
+ unsigned Elements = BV->getNumSources();
+ for (unsigned I = 0; I < Elements; ++I)
+ Casts.push_back(
+ B.buildInstr(Cast->getOpcode(), {ElemTy}, {BV->getSourceReg(I)})
+ .getReg(0));
+
+ B.buildBuildVector(Dst, Casts);
+ };
+
+ return true;
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d3c5742cee3eb4..33a1fa1ad04fdf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -953,6 +953,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(0, v2s64, v2s64)
.minScalarOrElt(0, s8)
.widenVectorEltsToVectorMinSize(0, 64)
+ .widenScalarOrEltToNextPow2(0)
.minScalarSameAs(1, 0);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
index 0f436127ea2eb6..9eef79a9c4bbee 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -129,3 +129,95 @@ body: |
%res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
%small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
$x0 = COPY %small(<2 x s32>)
+...
+---
+name: test_combine_trunc_build_vector
+legalized: true
+body: |
+ bb.1:
+ ; CHECK-PRE-LABEL: name: test_combine_trunc_build_vector
+ ; CHECK-PRE: %arg1:_(s64) = COPY $x0
+ ; CHECK-PRE-NEXT: %arg2:_(s64) = COPY $x0
+ ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
+ ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %arg2(s64)
+ ; CHECK-PRE-NEXT: %small:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32)
+ ; CHECK-PRE-NEXT: $x0 = COPY %small(<2 x s32>)
+ ;
+ ; CHECK-POST-LABEL: name: test_combine_trunc_build_vector
+ ; CHECK-POST: %arg1:_(s64) = COPY $x0
+ ; CHECK-POST-NEXT: %arg2:_(s64) = COPY $x0
+ ; CHECK-POST-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ ; CHECK-POST-NEXT: %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>)
+ ; CHECK-POST-NEXT: $x0 = COPY %small(<2 x s32>)
+ %arg1:_(s64) = COPY $x0
+ %arg2:_(s64) = COPY $x0
+ %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>)
+ $x0 = COPY %small(<2 x s32>)
+...
+---
+name: test_combine_zext_build_vector
+legalized: true
+body: |
+ bb.1:
+ ; CHECK-PRE-LABEL: name: test_combine_zext_build_vector
+ ; CHECK-PRE: %arg1:_(s32) = COPY $w0
+ ; CHECK-PRE-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %arg1(s32)
+ ; CHECK-PRE-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT %arg2(s32)
+ ; CHECK-PRE-NEXT: %large:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64)
+ ; CHECK-PRE-NEXT: $q0 = COPY %large(<2 x s64>)
+ ;
+ ; CHECK-POST-LABEL: name: test_combine_zext_build_vector
+ ; CHECK-POST: %arg1:_(s32) = COPY $w0
+ ; CHECK-POST-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-POST-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ ; CHECK-POST-NEXT: %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>)
+ ; CHECK-POST-NEXT: $q0 = COPY %large(<2 x s64>)
+ %arg1:_(s32) = COPY $w0
+ %arg2:_(s32) = COPY $w0
+ %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>)
+ $q0 = COPY %large(<2 x s64>)
+...
+---
+name: test_combine_anyext_build_vector
+legalized: true
+body: |
+ bb.1:
+ ; CHECK-PRE-LABEL: name: test_combine_anyext_build_vector
+ ; CHECK-PRE: %arg1:_(s32) = COPY $w0
+ ; CHECK-PRE-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %arg1(s32)
+ ; CHECK-PRE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT %arg2(s32)
+ ; CHECK-PRE-NEXT: %large:_(<2 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64)
+ ; CHECK-PRE-NEXT: $q0 = COPY %large(<2 x s64>)
+ ;
+ ; CHECK-POST-LABEL: name: test_combine_anyext_build_vector
+ ; CHECK-POST: %arg1:_(s32) = COPY $w0
+ ; CHECK-POST-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-POST-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ ; CHECK-POST-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+ ; CHECK-POST-NEXT: $q0 = COPY %large(<2 x s64>)
+ %arg1:_(s32) = COPY $w0
+ %arg2:_(s32) = COPY $w0
+ %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+ $q0 = COPY %large(<2 x s64>)
+...
+---
+name: test_combine_sext_build_vector
+legalized: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_sext_build_vector
+ ; CHECK: %arg1:_(s32) = COPY $w0
+ ; CHECK-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ ; CHECK-NEXT: %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
+ ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>)
+ %arg1:_(s32) = COPY $w0
+ %arg2:_(s32) = COPY $w0
+ %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
+ $q0 = COPY %large(<2 x s64>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index 70241e71aa593f..c98dcf6ccb7966 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -49,8 +49,8 @@ body: |
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %arg1:_(s64) = COPY $x0
- ; CHECK-NEXT: %extract:_(s32) = G_TRUNC %arg1(s64)
- ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %extract(s32)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: $x0 = COPY %zext(s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%arg1:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
index 6eece5c56258dc..8cb44605246ffa 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
@@ -60,8 +60,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
- ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), [[COPY1]](s32)
- ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: %trunc:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; CHECK-NEXT: %zext:_(<4 x s32>) = G_ZEXT %trunc(<4 x s16>)
+ ; CHECK-NEXT: $q0 = COPY %zext(<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s32) = COPY $w0
%1:_(s32) = COPY $w1
@@ -165,8 +168,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
- ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; CHECK-NEXT: %s:_(<4 x s32>) = G_SEXT %t(<4 x s16>)
+ ; CHECK-NEXT: $q0 = COPY %s(<4 x s32>)
%0:_(s32) = COPY $w0
%1:_(s32) = COPY $w1
%2:_(s32) = COPY $w2
@@ -188,8 +196,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
- ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: %t:_(<4 x s16>) = G_TRUNC %bv0(<4 x s32>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
%0:_(s32) = COPY $w0
@@ -213,8 +224,11 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
- ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: %t:_(<4 x s16>) = nsw G_TRUNC %bv0(<4 x s32>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
%0:_(s32) = COPY $w0
@@ -238,8 +252,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
- ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
+ ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
%0:_(s32) = COPY $w0
%1:_(s32) = COPY $w1
%2:_(s32) = COPY $w2
@@ -259,8 +278,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
- ; CHECK-NEXT: %bv0:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
- ; CHECK-NEXT: %z:_(<2 x s32>) = nuw G_TRUNC %bv0(<2 x s64>)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: %t:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; CHECK-NEXT: %z:_(<2 x s32>) = G_ZEXT %t(<2 x s16>)
; CHECK-NEXT: $d0 = COPY %z(<2 x s32>)
%0:_(s64) = COPY $x0
%1:_(s64) = COPY $x1
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index abf2e1272d6450..1f5654d59926dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -466,62 +466,92 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
;
; CHECK-GI-LABEL: sext_v32i1:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s17, w0
-; CHECK-GI-NEXT: fmov s19, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s21, [sp, #8]
-; CHECK-GI-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-NEXT: ldr s22, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-NEXT: ldr s23, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-NEXT: ldr s24, [sp, #104]
-; CHECK-GI-NEXT: mov.s v17[1], w1
-; CHECK-GI-NEXT: mov.s v19[1], w5
-; CHECK-GI-NEXT: ldr s5, [sp, #128]
-; CHECK-GI-NEXT: ldr s20, [sp, #136]
-; CHECK-GI-NEXT: mov.s v0[1], v21[0]
-; CHECK-GI-NEXT: ldr s7, [sp, #160]
-; CHECK-GI-NEXT: ldr s25, [sp, #168]
-; CHECK-GI-NEXT: mov.s v1[1], v22[0]
-; CHECK-GI-NEXT: mov.s v2[1], v23[0]
-; CHECK-GI-NEXT: mov.s v3[1], v24[0]
-; CHECK-GI-NEXT: mov.s v5[1], v20[0]
-; CHECK-GI-NEXT: mov.s v7[1], v25[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #16]
-; CHECK-GI-NEXT: ldr s18, [sp, #48]
-; CHECK-GI-NEXT: ldr s20, [sp, #80]
-; CHECK-GI-NEXT: ldr s21, [sp, #112]
-; CHECK-GI-NEXT: ldr s22, [sp, #144]
-; CHECK-GI-NEXT: ldr s23, [sp, #176]
-; CHECK-GI-NEXT: mov.s v17[2], w2
-; CHECK-GI-NEXT: mov.s v19[2], w6
-; CHECK-GI-NEXT: mov.s v0[2], v16[0]
-; CHECK-GI-NEXT: mov.s v1[2], v18[0]
-; CHECK-GI-NEXT: mov.s v2[2], v20[0]
-; CHECK-GI-NEXT: mov.s v3[2], v21[0]
-; CHECK-GI-NEXT: mov.s v5[2], v22[0]
-; CHECK-GI-NEXT: mov.s v7[2], v23[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #24]
-; CHECK-GI-NEXT: ldr s6, [sp, #56]
-; CHECK-GI-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-NEXT: ldr s18, [sp, #120]
-; CHECK-GI-NEXT: ldr s20, [sp, #152]
-; CHECK-GI-NEXT: ldr s21, [sp, #184]
-; CHECK-GI-NEXT: mov.s v17[3], w3
-; CHECK-GI-NEXT: mov.s v19[3], w7
-; CHECK-GI-NEXT: mov.s v0[3], v4[0]
-; CHECK-GI-NEXT: mov.s v1[3], v6[0]
-; CHECK-GI-NEXT: mov.s v2[3], v16[0]
-; CHECK-GI-NEXT: mov.s v3[3], v18[0]
-; CHECK-GI-NEXT: mov.s v5[3], v20[0]
-; CHECK-GI-NEXT: mov.s v7[3], v21[0]
-; CHECK-GI-NEXT: uzp1.8h v4, v17, v19
-; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
-; CHECK-GI-NEXT: uzp1.8h v2, v5, v7
-; CHECK-GI-NEXT: uzp1.16b v0, v4, v0
-; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
+; CHECK-GI-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-NEXT: ldr w8, [sp, #72]
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s2, w1
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #80]
+; CHECK-GI-NEXT: ldr w9, [sp, #128]
+; CHECK-GI-NEXT: mov.b v0[1], v2[0]
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: mov.b v1[1], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #88]
+; CHECK-GI-NEXT: mov.b v0[2], v2[0]
+; CHECK-GI-NEXT: fmov s2, w3
+; CHECK-GI-NEXT: mov.b v1[2], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #96]
+; CHECK-GI-NEXT: mov.b v0[3], v2[0]
+; CHECK-GI-NEXT: fmov s2, w4
+; CHECK-GI-NEXT: mov.b v1[3], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #104]
+; CHECK-GI-NEXT: mov.b v0[4], v2[0]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: mov.b v1[4], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #112]
+; CHECK-GI-NEXT: mov.b v0[5], v2[0]
+; CHECK-GI-NEXT: fmov s2, w6
+; CHECK-GI-NEXT: mov.b v1[5], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #120]
+; CHECK-GI-NEXT: mov.b v0[6], v2[0]
+; CHECK-GI-NEXT: fmov s2, w7
+; CHECK-GI-NEXT: mov.b v1[6], v3[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: mov.b v0[7], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-NEXT: mov.b v1[7], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #136]
+; CHECK-GI-NEXT: mov.b v0[8], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov.b v1[8], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #144]
+; CHECK-GI-NEXT: mov.b v0[9], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov.b v1[9], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #152]
+; CHECK-GI-NEXT: mov.b v0[10], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-NEXT: mov.b v1[10], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #160]
+; CHECK-GI-NEXT: mov.b v0[11], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-NEXT: mov.b v1[11], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #168]
+; CHECK-GI-NEXT: mov.b v0[12], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-NEXT: mov.b v1[12], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #176]
+; CHECK-GI-NEXT: mov.b v0[13], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov.b v1[13], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #184]
+; CHECK-GI-NEXT: mov.b v0[14], v2[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov.b v1[14], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: mov.b v0[15], v2[0]
+; CHECK-GI-NEXT: mov.b v1[15], v3[0]
; CHECK-GI-NEXT: shl.16b v0, v0, #7
; CHECK-GI-NEXT: shl.16b v1, v1, #7
; CHECK-GI-NEXT: sshr.16b v0, v0, #7
@@ -807,140 +837,198 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
;
; CHECK-GI-LABEL: sext_v64i1:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-NEXT: str x29, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w29, -16
-; CHECK-GI-NEXT: .cfi_offset b8, -24
-; CHECK-GI-NEXT: .cfi_offset b9, -32
-; CHECK-GI-NEXT: ldr s0, [sp, #32]
-; CHECK-GI-NEXT: ldr s4, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #96]
-; CHECK-GI-NEXT: ldr s5, [sp, #104]
-; CHECK-GI-NEXT: ldr s1, [sp, #64]
-; CHECK-GI-NEXT: ldr s23, [sp, #72]
-; CHECK-GI-NEXT: mov.s v0[1], v4[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #200]
-; CHECK-GI-NEXT: ldr s3, [sp, #128]
-; CHECK-GI-NEXT: mov.s v2[1], v5[0]
-; CHECK-GI-NEXT: mov.s v1[1], v23[0]
-; CHECK-GI-NEXT: ldr s5, [sp, #192]
-; CHECK-GI-NEXT: ldr s7, [sp, #136]
-; CHECK-GI-NEXT: ldr s4, [sp, #160]
-; CHECK-GI-NEXT: ldr s24, [sp, #168]
-; CHECK-GI-NEXT: mov.s v5[1], v28[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #48]
-; CHECK-GI-NEXT: ldr s21, [sp, #80]
-; CHECK-GI-NEXT: mov.s v3[1], v7[0]
-; CHECK-GI-NEXT: mov.s v4[1], v24[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #112]
-; CHECK-GI-NEXT: ldr s29, [sp, #208]
-; CHECK-GI-NEXT: mov.s v0[2], v6[0]
-; CHECK-GI-NEXT: mov.s v1[2], v21[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #224]
-; CHECK-GI-NEXT: ldr s30, [sp, #232]
-; CHECK-GI-NEXT: mov.s v2[2], v16[0]
-; CHECK-GI-NEXT: ldr s20, [sp, #144]
-; CHECK-GI-NEXT: ldr s27, [sp, #176]
-; CHECK-GI-NEXT: mov.s v5[2], v29[0]
-; CHECK-GI-NEXT: mov.s v6[1], v30[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #88]
-; CHECK-GI-NEXT: ldr s19, [sp, #120]
-; CHECK-GI-NEXT: ldr s7, [sp, #256]
-; CHECK-GI-NEXT: ldr s31, [sp, #264]
-; CHECK-GI-NEXT: mov.s v3[2], v20[0]
-; CHECK-GI-NEXT: mov.s v4[2], v27[0]
-; CHECK-GI-NEXT: ldr s25, [sp, #216]
-; CHECK-GI-NEXT: ldr s26, [sp, #240]
-; CHECK-GI-NEXT: ldr s17, [sp, #56]
-; CHECK-GI-NEXT: ldr s22, [sp, #152]
-; CHECK-GI-NEXT: mov.s v1[3], v18[0]
-; CHECK-GI-NEXT: ldr s23, [sp, #184]
-; CHECK-GI-NEXT: mov.s v2[3], v19[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #320]
-; CHECK-GI-NEXT: ldr s27, [sp, #328]
-; CHECK-GI-NEXT: mov.s v7[1], v31[0]
-; CHECK-GI-NEXT: ldr s19, [sp, #352]
-; CHECK-GI-NEXT: ldr s29, [sp, #360]
-; CHECK-GI-NEXT: mov.s v5[3], v25[0]
-; CHECK-GI-NEXT: mov.s v6[2], v26[0]
-; CHECK-GI-NEXT: fmov s25, w0
-; CHECK-GI-NEXT: fmov s26, w4
-; CHECK-GI-NEXT: ldr s28, [sp, #272]
-; CHECK-GI-NEXT: mov.s v0[3], v17[0]
-; CHECK-GI-NEXT: ldr s17, [sp, #288]
-; CHECK-GI-NEXT: ldr s8, [sp, #296]
-; CHECK-GI-NEXT: mov.s v3[3], v22[0]
-; CHECK-GI-NEXT: ldr s20, [sp, #384]
-; CHECK-GI-NEXT: mov.s v4[3], v23[0]
-; CHECK-GI-NEXT: ldr s30, [sp, #392]
-; CHECK-GI-NEXT: ldr s22, [sp, #416]
-; CHECK-GI-NEXT: ldr s31, [sp, #424]
-; CHECK-GI-NEXT: ldr s23, [sp, #448]
-; CHECK-GI-NEXT: mov.s v18[1], v27[0]
-; CHECK-GI-NEXT: mov.s v19[1], v29[0]
-; CHECK-GI-NEXT: ldr s27, [sp, #456]
-; CHECK-GI-NEXT: ldr s24, [sp, #336]
-; CHECK-GI-NEXT: mov.s v17[1], v8[0]
-; CHECK-GI-NEXT: mov.s v7[2], v28[0]
-; CHECK-GI-NEXT: mov.s v25[1], w1
-; CHECK-GI-NEXT: mov.s v26[1], w5
-; CHECK-GI-NEXT: mov.s v20[1], v30[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #368]
-; CHECK-GI-NEXT: mov.s v22[1], v31[0]
-; CHECK-GI-NEXT: mov.s v23[1], v27[0]
-; CHECK-GI-NEXT: ldr s9, [sp, #304]
-; CHECK-GI-NEXT: ldr s27, [sp, #400]
-; CHECK-GI-NEXT: mov.s v18[2], v24[0]
-; CHECK-GI-NEXT: ldr s24, [sp, #432]
-; CHECK-GI-NEXT: mov.s v19[2], v28[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #464]
-; CHECK-GI-NEXT: ldr s16, [sp, #248]
-; CHECK-GI-NEXT: ldr s21, [sp, #280]
-; CHECK-GI-NEXT: mov.s v17[2], v9[0]
-; CHECK-GI-NEXT: mov.s v25[2], w2
-; CHECK-GI-NEXT: mov.s v26[2], w6
-; CHECK-GI-NEXT: mov.s v20[2], v27[0]
-; CHECK-GI-NEXT: mov.s v22[2], v24[0]
-; CHECK-GI-NEXT: mov.s v23[2], v28[0]
-; CHECK-GI-NEXT: ldr s29, [sp, #312]
-; CHECK-GI-NEXT: ldr s27, [sp, #344]
-; CHECK-GI-NEXT: ldr s24, [sp, #376]
-; CHECK-GI-NEXT: ldr s28, [sp, #408]
-; CHECK-GI-NEXT: mov.s v6[3], v16[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #440]
-; CHECK-GI-NEXT: mov.s v7[3], v21[0]
-; CHECK-GI-NEXT: ldr s21, [sp, #472]
-; CHECK-GI-NEXT: mov.s v25[3], w3
-; CHECK-GI-NEXT: mov.s v26[3], w7
-; CHECK-GI-NEXT: mov.s v17[3], v29[0]
-; CHECK-GI-NEXT: mov.s v18[3], v27[0]
-; CHECK-GI-NEXT: mov.s v19[3], v24[0]
-; CHECK-GI-NEXT: mov.s v20[3], v28[0]
-; CHECK-GI-NEXT: mov.s v22[3], v16[0]
-; CHECK-GI-NEXT: mov.s v23[3], v21[0]
-; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
-; CHECK-GI-NEXT: uzp1.8h v2, v4, v5
-; CHECK-GI-NEXT: uzp1.8h v3, v6, v7
-; CHECK-GI-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: uzp1.8h v16, v25, v26
-; CHECK-GI-NEXT: uzp1.8h v4, v17, v18
-; CHECK-GI-NEXT: uzp1.8h v5, v19, v20
-; CHECK-GI-NEXT: uzp1.8h v6, v22, v23
-; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
-; CHECK-GI-NEXT: uzp1.16b v0, v16, v0
-; CHECK-GI-NEXT: uzp1.16b v2, v3, v4
-; CHECK-GI-NEXT: uzp1.16b v3, v5, v6
+; CHECK-GI-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-NEXT: ldr w11, [sp, #88]
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s3, w1
+; CHECK-GI-NEXT: ldr w8, [sp, #208]
+; CHECK-GI-NEXT: ldr w10, [sp, #216]
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #336]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: ldr w11, [sp, #344]
+; CHECK-GI-NEXT: mov.b v0[1], v3[0]
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #224]
+; CHECK-GI-NEXT: mov.b v1[1], v4[0]
+; CHECK-GI-NEXT: fmov s4, w2
+; CHECK-GI-NEXT: fmov s6, w11
+; CHECK-GI-NEXT: mov.b v2[1], v5[0]
+; CHECK-GI-NEXT: ldr w8, [sp, #96]
+; CHECK-GI-NEXT: ldr w10, [sp, #352]
+; CHECK-GI-NEXT: ldr w11, [sp, #16]
+; CHECK-GI-NEXT: mov.b v0[2], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #232]
+; CHECK-GI-NEXT: mov.b v3[1], v6[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #104]
+; CHECK-GI-NEXT: ldr w10, [sp, #360]
+; CHECK-GI-NEXT: mov.b v2[2], v4[0]
+; CHECK-GI-NEXT: fmov s4, w3
+; CHECK-GI-NEXT: mov.b v1[2], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #112]
+; CHECK-GI-NEXT: mov.b v3[2], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #368]
+; CHECK-GI-NEXT: mov.b v0[3], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #240]
+; CHECK-GI-NEXT: mov.b v1[3], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #120]
+; CHECK-GI-NEXT: mov.b v2[3], v4[0]
+; CHECK-GI-NEXT: fmov s4, w4
+; CHECK-GI-NEXT: mov.b v3[3], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #376]
+; CHECK-GI-NEXT: mov.b v0[4], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #248]
+; CHECK-GI-NEXT: mov.b v1[4], v5[0]
+; CHECK-GI-NEXT: mov.b v3[4], v6[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #128]
+; CHECK-GI-NEXT: ldr w10, [sp, #384]
+; CHECK-GI-NEXT: mov.b v2[4], v4[0]
+; CHECK-GI-NEXT: fmov s4, w5
+; CHECK-GI-NEXT: mov.b v1[5], v5[0]
+; CHECK-GI-NEXT: mov.b v3[5], v6[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: mov.b v0[5], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #256]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #136]
+; CHECK-GI-NEXT: ldr w10, [sp, #392]
+; CHECK-GI-NEXT: mov.b v2[5], v4[0]
+; CHECK-GI-NEXT: fmov s4, w6
+; CHECK-GI-NEXT: mov.b v1[6], v5[0]
+; CHECK-GI-NEXT: mov.b v3[6], v6[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #144]
+; CHECK-GI-NEXT: ldr w10, [sp, #400]
+; CHECK-GI-NEXT: mov.b v0[6], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #264]
+; CHECK-GI-NEXT: mov.b v1[7], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #152]
+; CHECK-GI-NEXT: mov.b v3[7], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #408]
+; CHECK-GI-NEXT: mov.b v2[6], v4[0]
+; CHECK-GI-NEXT: fmov s4, w7
+; CHECK-GI-NEXT: mov.b v1[8], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #160]
+; CHECK-GI-NEXT: mov.b v0[7], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #272]
+; CHECK-GI-NEXT: mov.b v3[8], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #416]
+; CHECK-GI-NEXT: mov.b v2[7], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #24]
+; CHECK-GI-NEXT: mov.b v1[9], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #168]
+; CHECK-GI-NEXT: mov.b v3[9], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #424]
+; CHECK-GI-NEXT: mov.b v0[8], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #280]
+; CHECK-GI-NEXT: mov.b v1[10], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #176]
+; CHECK-GI-NEXT: mov.b v2[8], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #32]
+; CHECK-GI-NEXT: mov.b v3[10], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #432]
+; CHECK-GI-NEXT: mov.b v0[9], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #288]
+; CHECK-GI-NEXT: mov.b v1[11], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #184]
+; CHECK-GI-NEXT: mov.b v3[11], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #440]
+; CHECK-GI-NEXT: mov.b v2[9], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: mov.b v1[12], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #192]
+; CHECK-GI-NEXT: mov.b v0[10], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #296]
+; CHECK-GI-NEXT: mov.b v3[12], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #448]
+; CHECK-GI-NEXT: mov.b v2[10], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #48]
+; CHECK-GI-NEXT: mov.b v1[13], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #200]
+; CHECK-GI-NEXT: mov.b v3[13], v6[0]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #456]
+; CHECK-GI-NEXT: mov.b v0[11], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #304]
+; CHECK-GI-NEXT: fmov s7, w10
+; CHECK-GI-NEXT: mov.b v1[14], v5[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: mov.b v2[11], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #56]
+; CHECK-GI-NEXT: mov.b v3[14], v6[0]
+; CHECK-GI-NEXT: mov.b v0[12], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #312]
+; CHECK-GI-NEXT: mov.b v1[15], v5[0]
+; CHECK-GI-NEXT: mov.b v3[15], v7[0]
+; CHECK-GI-NEXT: mov.b v2[12], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #64]
; CHECK-GI-NEXT: shl.16b v1, v1, #7
-; CHECK-GI-NEXT: shl.16b v0, v0, #7
-; CHECK-GI-NEXT: shl.16b v2, v2, #7
+; CHECK-GI-NEXT: mov.b v0[13], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #320]
; CHECK-GI-NEXT: shl.16b v3, v3, #7
; CHECK-GI-NEXT: sshr.16b v1, v1, #7
+; CHECK-GI-NEXT: mov.b v2[13], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #72]
+; CHECK-GI-NEXT: sshr.16b v3, v3, #7
+; CHECK-GI-NEXT: mov.b v0[14], v4[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #328]
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: mov.b v2[14], v4[0]
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: mov.b v0[15], v4[0]
+; CHECK-GI-NEXT: mov.b v2[15], v6[0]
+; CHECK-GI-NEXT: shl.16b v0, v0, #7
+; CHECK-GI-NEXT: shl.16b v2, v2, #7
; CHECK-GI-NEXT: sshr.16b v0, v0, #7
; CHECK-GI-NEXT: sshr.16b v2, v2, #7
-; CHECK-GI-NEXT: sshr.16b v3, v3, #7
-; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
%res = sext <64 x i1> %arg to <64 x i8>
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 38a568ac919168..a724958474cfb8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1022,11 +1022,18 @@ declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nou
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl_duprhs:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup v1.2s, w0
-; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uaddl_duprhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.2s, w0
+; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uaddl_duprhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: dup v1.2d, x8
+; CHECK-GI-NEXT: uaddw v0.2d, v1.2d, v0.2s
+; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -1048,8 +1055,8 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
;
; CHECK-GI-LABEL: uaddl2_duprhs:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v1.2s, w0
-; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: dup v1.2d, x8
; CHECK-GI-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@@ -1108,11 +1115,19 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
}
define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl_duprhs:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup v1.2s, w0
-; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: usubl_duprhs:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.2s, w0
+; CHECK-SD-NEXT: usubl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usubl_duprhs:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: dup v1.2d, x8
+; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -1134,9 +1149,10 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
;
; CHECK-GI-LABEL: usubl2_duprhs:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v1.2s, w0
-; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: dup v1.2d, x8
+; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT: ret
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 402682c89124bd..6f4b090fb22bd6 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1266,95 +1266,133 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
;
; CHECK-GI-LABEL: v20:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s4, [sp, #8]
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s19, [sp, #40]
-; CHECK-GI-NEXT: fmov s3, w4
-; CHECK-GI-NEXT: mov v0.s[1], v4.s[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #96]
-; CHECK-GI-NEXT: ldr s22, [sp, #104]
-; CHECK-GI-NEXT: mov v2.s[1], v19.s[0]
-; CHECK-GI-NEXT: ldr s19, [sp, #128]
-; CHECK-GI-NEXT: ldr s23, [sp, #136]
-; CHECK-GI-NEXT: ldr s18, [sp, #16]
-; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: mov v3.s[1], w5
-; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v19.s[1], v23.s[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #64]
-; CHECK-GI-NEXT: ldr s21, [sp, #72]
-; CHECK-GI-NEXT: mov v0.s[2], v18.s[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #160]
-; CHECK-GI-NEXT: ldr s24, [sp, #168]
-; CHECK-GI-NEXT: ldr s20, [sp, #192]
-; CHECK-GI-NEXT: ldr s25, [sp, #200]
-; CHECK-GI-NEXT: ldr s22, [sp, #224]
-; CHECK-GI-NEXT: ldr s27, [sp, #232]
-; CHECK-GI-NEXT: ldr s23, [sp, #112]
-; CHECK-GI-NEXT: ldr s26, [sp, #144]
-; CHECK-GI-NEXT: mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT: mov v20.s[1], v25.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v22.s[1], v27.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], w2
-; CHECK-GI-NEXT: ldr s17, [sp, #48]
-; CHECK-GI-NEXT: mov v3.s[2], w6
-; CHECK-GI-NEXT: mov v16.s[2], v23.s[0]
-; CHECK-GI-NEXT: mov v19.s[2], v26.s[0]
-; CHECK-GI-NEXT: ldr s7, [sp, #80]
-; CHECK-GI-NEXT: ldr s21, [sp, #176]
-; CHECK-GI-NEXT: ldr s24, [sp, #208]
-; CHECK-GI-NEXT: ldr s25, [sp, #240]
-; CHECK-GI-NEXT: mov v2.s[2], v17.s[0]
-; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: ldr s23, [sp, #152]
-; CHECK-GI-NEXT: ldr s5, [sp, #24]
-; CHECK-GI-NEXT: mov v18.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v20.s[2], v24.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v22.s[2], v25.s[0]
-; CHECK-GI-NEXT: mov v1.s[3], w3
-; CHECK-GI-NEXT: mov v3.s[3], w7
-; CHECK-GI-NEXT: mov v16.s[3], v17.s[0]
-; CHECK-GI-NEXT: mov v19.s[3], v23.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #56]
-; CHECK-GI-NEXT: ldr s7, [sp, #184]
-; CHECK-GI-NEXT: ldr s21, [sp, #216]
-; CHECK-GI-NEXT: ldr s17, [sp, #88]
-; CHECK-GI-NEXT: mov v0.s[3], v5.s[0]
-; CHECK-GI-NEXT: ldr s5, [sp, #248]
-; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v18.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v20.s[3], v21.s[0]
-; CHECK-GI-NEXT: mov v4.s[3], v17.s[0]
-; CHECK-GI-NEXT: mov v22.s[3], v5.s[0]
-; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT: uzp1 v5.8h, v16.8h, v19.8h
-; CHECK-GI-NEXT: dup v6.4s, w8
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT: uzp1 v2.8h, v18.8h, v20.8h
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v6.8h
-; CHECK-GI-NEXT: uzp1 v6.8h, v22.8h, v6.8h
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT: and v5.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT: add v1.8h, v1.8h, v5.8h
-; CHECK-GI-NEXT: and v4.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT: and v3.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT: add v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-NEXT: and w13, w2, #0xff
+; CHECK-GI-NEXT: ldr w11, [sp, #80]
+; CHECK-GI-NEXT: ldr w12, [sp, #88]
+; CHECK-GI-NEXT: fmov s19, w13
+; CHECK-GI-NEXT: fmov s0, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #224]
+; CHECK-GI-NEXT: fmov s16, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #232]
+; CHECK-GI-NEXT: fmov s3, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #240]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #248]
+; CHECK-GI-NEXT: fmov s1, w12
+; CHECK-GI-NEXT: fmov s7, w10
+; CHECK-GI-NEXT: and w10, w1, #0xff
+; CHECK-GI-NEXT: fmov s5, w11
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: and w9, w0, #0xff
+; CHECK-GI-NEXT: ldrb w11, [sp]
+; CHECK-GI-NEXT: ldrb w12, [sp, #8]
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: fmov s20, w10
+; CHECK-GI-NEXT: ldrb w9, [sp, #96]
+; CHECK-GI-NEXT: ldrb w10, [sp, #104]
+; CHECK-GI-NEXT: fmov s17, w11
+; CHECK-GI-NEXT: fmov s21, w12
+; CHECK-GI-NEXT: ldrb w11, [sp, #160]
+; CHECK-GI-NEXT: mov v0.b[1], v16.b[0]
+; CHECK-GI-NEXT: fmov s18, w9
+; CHECK-GI-NEXT: fmov s22, w10
+; CHECK-GI-NEXT: ldrb w9, [sp, #168]
+; CHECK-GI-NEXT: mov v6.h[1], v20.h[0]
+; CHECK-GI-NEXT: fmov s20, w11
+; CHECK-GI-NEXT: ldrb w10, [sp, #16]
+; CHECK-GI-NEXT: mov v17.h[1], v21.h[0]
+; CHECK-GI-NEXT: fmov s21, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #112]
+; CHECK-GI-NEXT: mov v18.h[1], v22.h[0]
+; CHECK-GI-NEXT: fmov s23, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #176]
+; CHECK-GI-NEXT: and w11, w3, #0xff
+; CHECK-GI-NEXT: mov v2.b[1], v7.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v6.h[2], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w9
+; CHECK-GI-NEXT: mov v20.h[1], v21.h[0]
+; CHECK-GI-NEXT: ldrb w9, [sp, #24]
+; CHECK-GI-NEXT: fmov s22, w11
+; CHECK-GI-NEXT: mov v17.h[2], v23.h[0]
+; CHECK-GI-NEXT: and w11, w4, #0xff
+; CHECK-GI-NEXT: mov v18.h[2], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #120]
+; CHECK-GI-NEXT: fmov s23, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #184]
+; CHECK-GI-NEXT: mov v6.h[3], v22.h[0]
+; CHECK-GI-NEXT: fmov s21, w11
+; CHECK-GI-NEXT: and w11, w6, #0xff
+; CHECK-GI-NEXT: mov v2.b[2], v5.b[0]
+; CHECK-GI-NEXT: mov v20.h[2], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w10
+; CHECK-GI-NEXT: fmov s16, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #128]
+; CHECK-GI-NEXT: and w10, w5, #0xff
+; CHECK-GI-NEXT: mov v17.h[3], v23.h[0]
+; CHECK-GI-NEXT: mov v6.h[4], v21.h[0]
+; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT: mov v18.h[3], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #192]
+; CHECK-GI-NEXT: mov v20.h[3], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #32]
+; CHECK-GI-NEXT: mov v2.b[3], v4.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v18.h[4], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #136]
+; CHECK-GI-NEXT: mov v6.h[5], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #48]
+; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT: mov v17.h[4], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #40]
+; CHECK-GI-NEXT: mov v18.h[5], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w9
+; CHECK-GI-NEXT: ldrb w9, [sp, #144]
+; CHECK-GI-NEXT: mov v20.h[4], v19.h[0]
+; CHECK-GI-NEXT: fmov s19, w11
+; CHECK-GI-NEXT: ldrb w11, [sp, #200]
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: fmov s7, w11
+; CHECK-GI-NEXT: mov v17.h[5], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w9
+; CHECK-GI-NEXT: ldrb w11, [sp, #208]
+; CHECK-GI-NEXT: mov v6.h[6], v19.h[0]
+; CHECK-GI-NEXT: ldrb w9, [sp, #56]
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v20.h[5], v7.h[0]
+; CHECK-GI-NEXT: fmov s7, w10
+; CHECK-GI-NEXT: mov v18.h[6], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w11
+; CHECK-GI-NEXT: ldrb w10, [sp, #152]
+; CHECK-GI-NEXT: and w11, w7, #0xff
+; CHECK-GI-NEXT: fmov s3, w11
+; CHECK-GI-NEXT: str q0, [x8, #64]
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: ldrb w10, [sp, #216]
+; CHECK-GI-NEXT: mov v17.h[6], v7.h[0]
+; CHECK-GI-NEXT: mov v20.h[6], v16.h[0]
+; CHECK-GI-NEXT: fmov s7, w9
+; CHECK-GI-NEXT: mov v6.h[7], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w10
+; CHECK-GI-NEXT: mov v18.h[7], v5.h[0]
+; CHECK-GI-NEXT: mov v17.h[7], v7.h[0]
+; CHECK-GI-NEXT: mov v20.h[7], v3.h[0]
+; CHECK-GI-NEXT: add v1.8h, v6.8h, v18.8h
+; CHECK-GI-NEXT: add v3.8h, v17.8h, v20.8h
+; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT: stp q2, q1, [x8]
; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT: stp q4, q0, [x8, #32]
-; CHECK-GI-NEXT: str q2, [x8, #64]
+; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT: stp q4, q1, [x8]
+; CHECK-GI-NEXT: stp q2, q3, [x8, #32]
; CHECK-GI-NEXT: ret
entry:
%s0s = zext <20 x i8> %s0 to <20 x i32>
@@ -1459,69 +1497,107 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
;
; CHECK-GI-LABEL: i12:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s4, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s20, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s21, [sp, #40]
-; CHECK-GI-NEXT: ldr s16, [sp, #64]
-; CHECK-GI-NEXT: ldr s22, [sp, #72]
-; CHECK-GI-NEXT: ldr s17, [sp, #96]
-; CHECK-GI-NEXT: ldr s23, [sp, #104]
-; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: mov v4.s[1], w5
-; CHECK-GI-NEXT: ldr s18, [sp, #128]
-; CHECK-GI-NEXT: ldr s24, [sp, #136]
-; CHECK-GI-NEXT: mov v0.s[1], v20.s[0]
-; CHECK-GI-NEXT: ldr s19, [sp, #160]
-; CHECK-GI-NEXT: ldr s25, [sp, #168]
-; CHECK-GI-NEXT: mov v2.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v17.s[1], v23.s[0]
-; CHECK-GI-NEXT: mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT: mov v19.s[1], v25.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-NEXT: ldr s20, [sp, #80]
-; CHECK-GI-NEXT: ldr s21, [sp, #112]
-; CHECK-GI-NEXT: ldr s22, [sp, #144]
-; CHECK-GI-NEXT: ldr s23, [sp, #176]
-; CHECK-GI-NEXT: mov v1.s[2], w2
-; CHECK-GI-NEXT: mov v4.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v16.s[2], v20.s[0]
-; CHECK-GI-NEXT: mov v17.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v18.s[2], v22.s[0]
-; CHECK-GI-NEXT: mov v19.s[2], v23.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: ldr s5, [sp, #56]
-; CHECK-GI-NEXT: ldr s6, [sp, #88]
-; CHECK-GI-NEXT: ldr s7, [sp, #120]
-; CHECK-GI-NEXT: ldr s20, [sp, #152]
-; CHECK-GI-NEXT: ldr s21, [sp, #184]
-; CHECK-GI-NEXT: mov v1.s[3], w3
-; CHECK-GI-NEXT: mov v4.s[3], w7
-; CHECK-GI-NEXT: movi v22.4s, #15, msl #8
-; CHECK-GI-NEXT: mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v5.s[0]
-; CHECK-GI-NEXT: mov v16.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v17.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v18.s[3], v20.s[0]
-; CHECK-GI-NEXT: mov v19.s[3], v21.s[0]
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v22.16b
-; CHECK-GI-NEXT: and v3.16b, v4.16b, v22.16b
-; CHECK-GI-NEXT: and v4.16b, v0.16b, v22.16b
-; CHECK-GI-NEXT: and v5.16b, v2.16b, v22.16b
-; CHECK-GI-NEXT: and v0.16b, v16.16b, v22.16b
-; CHECK-GI-NEXT: and v2.16b, v17.16b, v22.16b
-; CHECK-GI-NEXT: and v6.16b, v18.16b, v22.16b
-; CHECK-GI-NEXT: and v7.16b, v19.16b, v22.16b
-; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: add v1.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT: add v2.4s, v4.4s, v6.4s
-; CHECK-GI-NEXT: add v3.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: ldr w11, [sp, #32]
+; CHECK-GI-NEXT: ldr w12, [sp, #40]
+; CHECK-GI-NEXT: fmov s5, w7
+; CHECK-GI-NEXT: ldr w10, [sp, #16]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: fmov s4, w12
+; CHECK-GI-NEXT: ldr w12, [sp, #96]
+; CHECK-GI-NEXT: ldr w13, [sp, #104]
+; CHECK-GI-NEXT: ldr w14, [sp, #128]
+; CHECK-GI-NEXT: ldr w15, [sp, #136]
+; CHECK-GI-NEXT: ldr w16, [sp, #160]
+; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: fmov s7, w13
+; CHECK-GI-NEXT: fmov s16, w15
+; CHECK-GI-NEXT: ldr w17, [sp, #168]
+; CHECK-GI-NEXT: ldr w9, [sp, #24]
+; CHECK-GI-NEXT: ldr w13, [sp, #176]
+; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w6
+; CHECK-GI-NEXT: fmov s17, w17
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #48]
+; CHECK-GI-NEXT: mov v1.h[3], v5.h[0]
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #64]
+; CHECK-GI-NEXT: mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w3
+; CHECK-GI-NEXT: mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT: fmov s5, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #72]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: fmov s6, w11
+; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-NEXT: ldr w10, [sp, #112]
+; CHECK-GI-NEXT: ldr w11, [sp, #144]
+; CHECK-GI-NEXT: mov v2.h[3], v4.h[0]
+; CHECK-GI-NEXT: mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT: fmov s6, w12
+; CHECK-GI-NEXT: fmov s18, w11
+; CHECK-GI-NEXT: ldr w12, [sp, #88]
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT: fmov s7, w14
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v7.h[1], v16.h[0]
+; CHECK-GI-NEXT: fmov s16, w16
+; CHECK-GI-NEXT: mov v16.h[1], v17.h[0]
+; CHECK-GI-NEXT: fmov s17, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #152]
+; CHECK-GI-NEXT: mov v7.h[2], v18.h[0]
+; CHECK-GI-NEXT: fmov s18, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #120]
+; CHECK-GI-NEXT: mov v5.h[2], v17.h[0]
+; CHECK-GI-NEXT: fmov s17, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #184]
+; CHECK-GI-NEXT: mov v3.h[3], v18.h[0]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: fmov s18, w10
+; CHECK-GI-NEXT: mov v6.h[2], v17.h[0]
+; CHECK-GI-NEXT: fmov s17, w13
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mov v16.h[2], v17.h[0]
+; CHECK-GI-NEXT: fmov s17, w12
+; CHECK-GI-NEXT: mov v6.h[3], v4.h[0]
+; CHECK-GI-NEXT: movi v4.4s, #15, msl #8
+; CHECK-GI-NEXT: mov v5.h[3], v17.h[0]
+; CHECK-GI-NEXT: fmov s17, w9
+; CHECK-GI-NEXT: mov v16.h[3], v18.h[0]
+; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT: mov v7.h[3], v17.h[0]
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT: and v6.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT: and v7.16b, v7.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v16.16b, v4.16b
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v7.4s
+; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s
; CHECK-GI-NEXT: ret
entry:
%s0s = zext <16 x i12> %s0 to <16 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 5237a3491de9b4..529a3b72e09714 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,12 +219,21 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT: lsl w8, w0, #8
+; CHECK-GI-NEXT: lsl w9, w1, #8
+; CHECK-GI-NEXT: lsl w10, w2, #8
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: sxth w9, w9
+; CHECK-GI-NEXT: sxth w10, w10
+; CHECK-GI-NEXT: asr w8, w8, #8
+; CHECK-GI-NEXT: asr w9, w9, #8
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: asr w8, w10, #8
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i16>
@@ -244,16 +253,12 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #24 // =0x18
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: neg v1.4s, v1.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: sxtb w8, w0
+; CHECK-GI-NEXT: sxtb w9, w1
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: sxtb w8, w2
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i32>
@@ -280,16 +285,15 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: sxtb x8, w2
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: sxtb x8, w0
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: sxtb x10, w2
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i64>
@@ -382,12 +386,21 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: shl v0.4h, v0.4h, #6
-; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #6
+; CHECK-GI-NEXT: lsl w8, w0, #6
+; CHECK-GI-NEXT: lsl w9, w1, #6
+; CHECK-GI-NEXT: lsl w10, w2, #6
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: sxth w9, w9
+; CHECK-GI-NEXT: sxth w10, w10
+; CHECK-GI-NEXT: asr w8, w8, #6
+; CHECK-GI-NEXT: asr w9, w9, #6
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: asr w8, w10, #6
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i16>
@@ -407,16 +420,12 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #22 // =0x16
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: neg v1.4s, v1.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: sbfx w8, w0, #0, #10
+; CHECK-GI-NEXT: sbfx w9, w1, #0, #10
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: sbfx w8, w2, #0, #10
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i32>
@@ -443,16 +452,15 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #54
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: sbfx x8, x0, #0, #10
+; CHECK-GI-NEXT: sbfx x9, x1, #0, #10
+; CHECK-GI-NEXT: sbfx x10, x2, #0, #10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i64>
@@ -1024,34 +1032,48 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s3, [sp, #40]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #24]
-; CHECK-GI-NEXT: ldr s3, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: uzp1 v1.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s2, w1
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w3
+; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w4
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-NEXT: mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w6
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w7
+; CHECK-GI-NEXT: mov v1.h[6], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT: mov v1.h[7], v3.h[0]
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6
; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6
-; CHECK-GI-NEXT: shl v2.8h, v0.8h, #6
-; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #6
-; CHECK-GI-NEXT: sshr v1.8h, v2.8h, #6
+; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6
+; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #6
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i16>
@@ -1101,36 +1123,54 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s3, [sp, #40]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #24]
-; CHECK-GI-NEXT: ldr s3, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: shl v1.4s, v4.4s, #22
-; CHECK-GI-NEXT: shl v3.4s, v5.4s, #22
-; CHECK-GI-NEXT: shl v4.4s, v0.4s, #22
-; CHECK-GI-NEXT: shl v5.4s, v2.4s, #22
-; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22
-; CHECK-GI-NEXT: sshr v1.4s, v3.4s, #22
-; CHECK-GI-NEXT: sshr v2.4s, v4.4s, #22
-; CHECK-GI-NEXT: sshr v3.4s, v5.4s, #22
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: fmov s6, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #48]
+; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT: mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: ldr w9, [sp, #56]
+; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w6
+; CHECK-GI-NEXT: mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v5.h[2], v6.h[0]
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w3
+; CHECK-GI-NEXT: mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w7
+; CHECK-GI-NEXT: mov v5.h[3], v6.h[0]
+; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
+; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v5.4h, #0
+; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22
+; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22
+; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22
+; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22
+; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #22
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i32>
@@ -1188,49 +1228,69 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s7, w0
-; CHECK-GI-NEXT: fmov s17, w2
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: fmov s18, w4
-; CHECK-GI-NEXT: fmov s19, w6
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
-; CHECK-GI-NEXT: ldr s6, [sp, #48]
-; CHECK-GI-NEXT: ldr s16, [sp, #56]
-; CHECK-GI-NEXT: mov v7.s[1], w1
-; CHECK-GI-NEXT: mov v17.s[1], w3
-; CHECK-GI-NEXT: mov v18.s[1], w5
-; CHECK-GI-NEXT: mov v19.s[1], w7
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: mov v6.s[1], v16.s[0]
-; CHECK-GI-NEXT: ushll v1.2d, v7.2s, #0
-; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-GI-NEXT: shl v1.2d, v1.2d, #54
-; CHECK-GI-NEXT: shl v3.2d, v3.2d, #54
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #48]
+; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w10
+; CHECK-GI-NEXT: mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w2
+; CHECK-GI-NEXT: mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w6
+; CHECK-GI-NEXT: mov v2.h[3], v5.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #56]
+; CHECK-GI-NEXT: mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w3
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w7
+; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0
+; CHECK-GI-NEXT: mov v1.h[3], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54
+; CHECK-GI-NEXT: mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54
+; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54
+; CHECK-GI-NEXT: ushll v7.2d, v3.2s, #0
+; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0
; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54
+; CHECK-GI-NEXT: shl v17.2d, v1.2d, #54
+; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54
+; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54
+; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54
; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54
-; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54
-; CHECK-GI-NEXT: shl v17.2d, v2.2d, #54
-; CHECK-GI-NEXT: shl v18.2d, v4.2d, #54
-; CHECK-GI-NEXT: shl v19.2d, v6.2d, #54
-; CHECK-GI-NEXT: sshr v0.2d, v1.2d, #54
-; CHECK-GI-NEXT: sshr v1.2d, v3.2d, #54
+; CHECK-GI-NEXT: shl v19.2d, v3.2d, #54
; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54
-; CHECK-GI-NEXT: sshr v3.2d, v7.2d, #54
-; CHECK-GI-NEXT: sshr v4.2d, v16.2d, #54
-; CHECK-GI-NEXT: sshr v5.2d, v17.2d, #54
-; CHECK-GI-NEXT: sshr v6.2d, v18.2d, #54
+; CHECK-GI-NEXT: sshr v3.2d, v17.2d, #54
+; CHECK-GI-NEXT: sshr v5.2d, v18.2d, #54
+; CHECK-GI-NEXT: sshr v6.2d, v7.2d, #54
; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c81fd26a775256..54ada05c904487 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -3812,51 +3812,72 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w2
+; CHECK-GI-NEXT: fmov s2, w10
+; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w3
+; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w5
+; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w6
+; CHECK-GI-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w7
+; CHECK-GI-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-NEXT: mov v0.b[9], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-NEXT: mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-NEXT: mov v0.b[11], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-NEXT: mov v1.b[4], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-NEXT: mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-NEXT: mov v0.b[13], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v1.b[6], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v1.b[7], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[15], v3.b[0]
; CHECK-GI-NEXT: uaddlv h1, v1.8b
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
@@ -3938,51 +3959,72 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-NEXT: saddlv h0, v0.16b
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w2
+; CHECK-GI-NEXT: fmov s2, w10
+; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w3
+; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w5
+; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w6
+; CHECK-GI-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w7
+; CHECK-GI-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-NEXT: mov v0.b[9], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-NEXT: mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-NEXT: mov v0.b[11], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-NEXT: mov v1.b[4], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-NEXT: mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-NEXT: mov v0.b[13], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v1.b[6], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v1.b[7], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[15], v3.b[0]
; CHECK-GI-NEXT: saddlv h1, v1.8b
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
@@ -4125,106 +4167,149 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: fmov s4, w0
-; CHECK-GI-BASE-NEXT: fmov s5, w4
-; CHECK-GI-BASE-NEXT: ldr s0, [sp]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT: fmov s0, w0
+; CHECK-GI-BASE-NEXT: fmov s1, w1
+; CHECK-GI-BASE-NEXT: ldr w8, [sp]
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w2
+; CHECK-GI-BASE-NEXT: fmov s2, w10
+; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w3
+; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w4
+; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w5
+; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w6
+; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w7
+; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0]
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
-; CHECK-GI-BASE-NEXT: fmov w8, s0
+; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w9, s1
+; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s4, w0
-; CHECK-GI-DOT-NEXT: fmov s5, w4
-; CHECK-GI-DOT-NEXT: ldr s0, [sp]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: fmov s0, w0
+; CHECK-GI-DOT-NEXT: fmov s1, w1
+; CHECK-GI-DOT-NEXT: ldr w8, [sp]
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w2
+; CHECK-GI-DOT-NEXT: fmov s3, w10
+; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w3
+; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w4
+; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w5
+; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w6
+; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w7
+; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-DOT-NEXT: fmov s5, w9
+; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0]
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: movi v3.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0]
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w8
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: fmov d1, d1
+; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0]
+; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
@@ -4398,106 +4483,149 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: fmov s4, w0
-; CHECK-GI-BASE-NEXT: fmov s5, w4
-; CHECK-GI-BASE-NEXT: ldr s0, [sp]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT: fmov s0, w0
+; CHECK-GI-BASE-NEXT: fmov s1, w1
+; CHECK-GI-BASE-NEXT: ldr w8, [sp]
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w2
+; CHECK-GI-BASE-NEXT: fmov s2, w10
+; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w3
+; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w4
+; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w5
+; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w6
+; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w7
+; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-GI-BASE-NEXT: fmov s1, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0]
+; CHECK-GI-BASE-NEXT: fmov s2, w9
+; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-BASE-NEXT: fmov s3, w8
+; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0]
; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
-; CHECK-GI-BASE-NEXT: fmov w8, s0
+; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w9, s1
+; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s4, w0
-; CHECK-GI-DOT-NEXT: fmov s5, w4
-; CHECK-GI-DOT-NEXT: ldr s0, [sp]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
-; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: fmov s0, w0
+; CHECK-GI-DOT-NEXT: fmov s1, w1
+; CHECK-GI-DOT-NEXT: ldr w8, [sp]
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72]
+; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w2
+; CHECK-GI-DOT-NEXT: fmov s3, w10
+; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w3
+; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w4
+; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w5
+; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w6
+; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w7
+; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT: fmov s1, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w9
+; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0]
+; CHECK-GI-DOT-NEXT: fmov s2, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-DOT-NEXT: fmov s5, w9
+; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w8
+; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0]
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: movi v3.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0]
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-DOT-NEXT: fmov s3, w8
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: fmov d1, d1
+; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0]
+; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 3c86f4bf9eb213..e536ba240453e2 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -127,19 +127,12 @@ entry:
}
define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
-; CHECK-SD-LABEL: xtn_v2i128_v2i8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: mov v0.s[1], w2
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v2i128_v2i8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
-; CHECK-GI-NEXT: mov v0.d[1], x2
-; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v2i128_v2i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov v0.s[1], w2
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <2 x i128> %a to <2 x i8>
ret <2 x i8> %arg1
@@ -174,9 +167,11 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
-; CHECK-GI-NEXT: mov v0.d[1], x2
-; CHECK-GI-NEXT: xtn v0.2s, v0.2d
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w2
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%arg1 = trunc <2 x i128> %a to <2 x i16>
@@ -194,19 +189,12 @@ entry:
}
define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
-; CHECK-SD-LABEL: xtn_v2i128_v2i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: mov v0.s[1], w2
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v2i128_v2i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
-; CHECK-GI-NEXT: mov v0.d[1], x2
-; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v2i128_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov v0.s[1], w2
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <2 x i128> %a to <2 x i32>
ret <2 x i32> %arg1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 716d2398996be2..bb968c8eb00fcb 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -242,16 +242,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: and w9, w1, #0xff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: and w8, w2, #0xff
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v2.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i16>
@@ -271,14 +270,12 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #255 // =0xff
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: and w9, w1, #0xff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: and w8, w2, #0xff
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i32>
@@ -305,16 +302,15 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: and x8, x2, #0xff
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: and x8, x0, #0xff
+; CHECK-GI-NEXT: and x9, x1, #0xff
+; CHECK-GI-NEXT: and x10, x2, #0xff
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i64>
@@ -407,16 +403,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
+; CHECK-GI-NEXT: and w8, w0, #0x3ff
+; CHECK-GI-NEXT: and w9, w1, #0x3ff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: and w8, w2, #0x3ff
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v2.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i16>
@@ -436,14 +431,12 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: and w8, w0, #0x3ff
+; CHECK-GI-NEXT: and w9, w1, #0x3ff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: and w8, w2, #0x3ff
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i32>
@@ -469,17 +462,15 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: adrp x8, .LCPI27_0
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_0]
-; CHECK-GI-NEXT: and x8, x2, #0x3ff
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: and x8, x0, #0x3ff
+; CHECK-GI-NEXT: and x9, x1, #0x3ff
+; CHECK-GI-NEXT: and x10, x2, #0x3ff
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i64>
@@ -1098,33 +1089,51 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s2, [sp]
-; CHECK-GI-NEXT: ldr s0, [sp, #8]
-; CHECK-GI-NEXT: ldr s3, [sp, #32]
-; CHECK-GI-NEXT: ldr s1, [sp, #40]
-; CHECK-GI-NEXT: movi v6.4s, #3, msl #8
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v2.s[1], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v1.s[0]
-; CHECK-GI-NEXT: ldr s0, [sp, #16]
-; CHECK-GI-NEXT: ldr s1, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v2.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v1.s[0]
-; CHECK-GI-NEXT: ldr s0, [sp, #24]
-; CHECK-GI-NEXT: ldr s1, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v2.s[3], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v1.s[0]
-; CHECK-GI-NEXT: and v0.16b, v4.16b, v6.16b
-; CHECK-GI-NEXT: and v1.16b, v5.16b, v6.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
-; CHECK-GI-NEXT: and v3.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: fmov s3, w5
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: fmov s6, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #48]
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w2
+; CHECK-GI-NEXT: mov v2.h[1], v4.h[0]
+; CHECK-GI-NEXT: mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: ldr w9, [sp, #56]
+; CHECK-GI-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w6
+; CHECK-GI-NEXT: mov v2.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v5.h[2], v6.h[0]
+; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w3
+; CHECK-GI-NEXT: mov v2.h[3], v4.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w7
+; CHECK-GI-NEXT: mov v5.h[3], v6.h[0]
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: movi v3.4s, #3, msl #8
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v5.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v3.16b, v4.16b, v3.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i32>
@@ -1176,44 +1185,64 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s16, w0
-; CHECK-GI-NEXT: fmov s17, w2
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: fmov s18, w4
-; CHECK-GI-NEXT: fmov s19, w6
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
-; CHECK-GI-NEXT: ldr s6, [sp, #48]
-; CHECK-GI-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-NEXT: mov v16.s[1], w1
-; CHECK-GI-NEXT: mov v17.s[1], w3
-; CHECK-GI-NEXT: mov v18.s[1], w5
-; CHECK-GI-NEXT: mov v19.s[1], w7
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: mov v6.s[1], v7.s[0]
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: ldr w8, [sp]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: ldr w9, [sp, #8]
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: fmov s4, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #48]
+; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #24]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: fmov s3, w10
+; CHECK-GI-NEXT: mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w2
+; CHECK-GI-NEXT: mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT: fmov s5, w8
; CHECK-GI-NEXT: adrp x8, .LCPI54_0
-; CHECK-GI-NEXT: ushll v1.2d, v16.2s, #0
-; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
-; CHECK-GI-NEXT: ushll v16.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll v18.2d, v2.2s, #0
-; CHECK-GI-NEXT: ushll v19.2d, v4.2s, #0
-; CHECK-GI-NEXT: ushll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT: and v0.16b, v1.16b, v17.16b
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v17.16b
-; CHECK-GI-NEXT: and v2.16b, v5.16b, v17.16b
-; CHECK-GI-NEXT: and v3.16b, v7.16b, v17.16b
-; CHECK-GI-NEXT: and v4.16b, v16.16b, v17.16b
-; CHECK-GI-NEXT: and v5.16b, v18.16b, v17.16b
-; CHECK-GI-NEXT: and v6.16b, v19.16b, v17.16b
-; CHECK-GI-NEXT: and v7.16b, v20.16b, v17.16b
+; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w6
+; CHECK-GI-NEXT: mov v2.h[3], v5.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ldr w9, [sp, #56]
+; CHECK-GI-NEXT: mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w3
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w7
+; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0
+; CHECK-GI-NEXT: mov v1.h[3], v4.h[0]
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v6.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll2 v16.2d, v1.4s, #0
+; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b
+; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b
+; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b
+; CHECK-GI-NEXT: ushll v19.2d, v3.2s, #0
+; CHECK-GI-NEXT: ushll2 v20.2d, v3.4s, #0
+; CHECK-GI-NEXT: and v2.16b, v6.16b, v7.16b
+; CHECK-GI-NEXT: and v3.16b, v16.16b, v7.16b
+; CHECK-GI-NEXT: and v6.16b, v19.16b, v7.16b
+; CHECK-GI-NEXT: and v7.16b, v20.16b, v7.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6a291510fe66c1..90ee4b266f41dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,22 +374,22 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+ ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..29b66288b3e4b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,22 +246,20 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0, $vgpr1
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+ ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
>From 56938a159bfc8a25f35c06bf854f6a3b8b01d47e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 16 Aug 2024 21:25:42 +0200
Subject: [PATCH 2/2] multi use
---
.../AArch64/GlobalISel/combine-cast.mir | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
index 9eef79a9c4bbee..026b18139c2a48 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -221,3 +221,22 @@ body: |
%bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
%large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
$q0 = COPY %large(<2 x s64>)
+...
+---
+name: test_combine_anyext_build_vector_multi_use
+legalized: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_anyext_build_vector_multi_use
+ ; CHECK: %arg1:_(s32) = COPY $w0
+ ; CHECK-NEXT: %arg2:_(s32) = COPY $w0
+ ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ ; CHECK-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+ ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>)
+ ; CHECK-NEXT: $d0 = COPY %bv(<2 x s32>)
+ %arg1:_(s32) = COPY $w0
+ %arg2:_(s32) = COPY $w0
+ %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+ %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+ $q0 = COPY %large(<2 x s64>)
+ $d0 = COPY %bv(<2 x s32>)
More information about the llvm-commits mailing list