[llvm] [AArch64][GlobalISel] Refactor Combine G_CONCAT_VECTOR (PR #80866)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 7 02:25:43 PST 2024
https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/80866
>From 5e36dff851caa191f4230b095eee0ab48e0313b5 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Mon, 5 Feb 2024 16:42:20 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Refactor Combine G_CONCAT_VECTOR
The combine is now driven by tablegen and checks that the new instruction
is legal before creating it.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 24 +-
.../include/llvm/Target/GlobalISel/Combine.td | 11 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 40 +-
llvm/lib/Target/AArch64/AArch64Combine.td | 4 +-
.../GISel/AArch64O0PreLegalizerCombiner.cpp | 2 -
.../GISel/AArch64PreLegalizerCombiner.cpp | 2 -
.../AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 2 -
llvm/test/CodeGen/AArch64/itofp.ll | 570 +++++++++---------
.../AArch64/neon-bitwise-instructions.ll | 14 +-
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 352 +++++------
10 files changed, 510 insertions(+), 511 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 10eeafdd09a8e..133b5d2c4e6b7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -224,22 +224,22 @@ class CombinerHelper {
/// - concat_vector(undef, undef) => undef
/// - concat_vector(build_vector(A, B), build_vector(C, D)) =>
/// build_vector(A, B, C, D)
- ///
- /// \pre MI.getOpcode() == G_CONCAT_VECTORS.
- bool tryCombineConcatVectors(MachineInstr &MI);
+ /// ==========================================================
/// Check if the G_CONCAT_VECTORS \p MI is undef or if it
/// can be flattened into a build_vector.
- /// In the first case \p IsUndef will be true.
- /// In the second case \p Ops will contain the operands needed
- /// to produce the flattened build_vector.
+ /// In the first case \p matchinfo.first will be true.
+ /// In the second case \p matchinfo.second will contain the operands
+ /// needed to produce the flattened build_vector.
///
/// \pre MI.getOpcode() == G_CONCAT_VECTORS.
- bool matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
- SmallVectorImpl<Register> &Ops);
- /// Replace \p MI with a flattened build_vector with \p Ops or an
- /// implicit_def if IsUndef is true.
- void applyCombineConcatVectors(MachineInstr &MI, bool IsUndef,
- const ArrayRef<Register> Ops);
+ bool
+ matchCombineConcatVectors(MachineInstr &MI,
+ std::pair<bool, SmallVector<Register>> &matchinfo);
+ /// Replace \p MI with a flattened build_vector of \p matchinfo.second
+ /// or an implicit_def if \p matchinfo.first is true.
+ void
+ applyCombineConcatVectors(MachineInstr &MI,
+ std::pair<bool, SmallVector<Register>> &matchinfo);
/// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
/// Returns true if MI changed.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9b0e1b0d7c4f9..77a6faaf837d5 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1253,6 +1253,14 @@ def match_ors : GICombineRule<
[{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+// Fold G_CONCAT_VECTORS of build_vector/undef operands into one build_vector or undef.
+def concat_matchinfo : GIDefMatchData<"std::pair<bool, SmallVector<Register>>">;
+def combine_concat_vector : GICombineRule<
+ (defs root:$root, concat_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_CONCAT_VECTORS):$root,
+ [{ return Helper.matchCombineConcatVectors(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyCombineConcatVectors(*${root}, ${matchinfo}); }])>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1326,7 +1334,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
intdiv_combines, mulh_combines, redundant_neg_operands,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
- fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors]>;
+ fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
+ combine_concat_vector]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1b199cfd41d23..71b383b54a141 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -222,21 +222,11 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
replaceRegWith(MRI, DstReg, SrcReg);
}
-bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
- bool IsUndef = false;
- SmallVector<Register, 4> Ops;
- if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
- applyCombineConcatVectors(MI, IsUndef, Ops);
- return true;
- }
- return false;
-}
-
-bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
- SmallVectorImpl<Register> &Ops) {
+bool CombinerHelper::matchCombineConcatVectors(
+ MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
"Invalid instruction");
- IsUndef = true;
+ matchinfo.first = true;
MachineInstr *Undef = nullptr;
// Walk over all the operands of concat vectors and check if they are
@@ -246,13 +236,15 @@ bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
Register Reg = MO.getReg();
MachineInstr *Def = MRI.getVRegDef(Reg);
assert(Def && "Operand not defined");
+ if (!MRI.hasOneNonDBGUse(Reg))
+ return false;
switch (Def->getOpcode()) {
case TargetOpcode::G_BUILD_VECTOR:
- IsUndef = false;
+ matchinfo.first = false;
// Remember the operands of the build_vector to fold
// them into the yet-to-build flattened concat vectors.
for (const MachineOperand &BuildVecMO : Def->uses())
- Ops.push_back(BuildVecMO.getReg());
+ matchinfo.second.push_back(BuildVecMO.getReg());
break;
case TargetOpcode::G_IMPLICIT_DEF: {
LLT OpType = MRI.getType(Reg);
@@ -268,17 +260,25 @@ bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
// for the flattening.
for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
EltIdx != EltEnd; ++EltIdx)
- Ops.push_back(Undef->getOperand(0).getReg());
+ matchinfo.second.push_back(Undef->getOperand(0).getReg());
break;
}
default:
return false;
}
}
+
+ // Only combine if the G_BUILD_VECTOR is legal or we are before the legalizer.
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_BUILD_VECTOR,
+ {DstTy, MRI.getType(matchinfo.second[0])}})) {
+ return false;
+ }
+
return true;
}
void CombinerHelper::applyCombineConcatVectors(
- MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
+ MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
// We determined that the concat_vectors can be flatten.
// Generate the flattened build_vector.
Register DstReg = MI.getOperand(0).getReg();
@@ -289,12 +289,12 @@ void CombinerHelper::applyCombineConcatVectors(
// checking that at all Ops are undef. Alternatively, we could have
// generate a build_vector of undefs and rely on another combine to
// clean that up. For now, given we already gather this information
- // in tryCombineConcatVectors, just save compile time and issue the
+ // in matchCombineConcatVectors, just save compile time and issue the
// right thing.
- if (IsUndef)
+ if (matchinfo.first)
Builder.buildUndef(NewDstReg);
else
- Builder.buildBuildVector(NewDstReg, Ops);
+ Builder.buildBuildVector(NewDstReg, matchinfo.second);
MI.eraseFromParent();
replaceRegWith(MRI, DstReg, NewDstReg);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1daa7d5fe6a7a..e4d8359c71e62 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -64,7 +64,7 @@ def AArch64PreLegalizerCombiner: GICombiner<
}
def AArch64O0PreLegalizerCombiner: GICombiner<
- "AArch64O0PreLegalizerCombinerImpl", [optnone_combines]> {
+ "AArch64O0PreLegalizerCombinerImpl", [optnone_combines, combine_concat_vector]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
@@ -288,5 +288,5 @@ def AArch64PostLegalizerCombiner
constant_fold_binops, identity_combines,
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
- select_to_minmax, or_to_bsp]> {
+ select_to_minmax, or_to_bsp, combine_concat_vector]> {
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
index 0b82ed1280ddd..17dd8f2314a2b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -91,8 +91,6 @@ bool AArch64O0PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
switch (Opc) {
- case TargetOpcode::G_CONCAT_VECTORS:
- return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
case TargetOpcode::G_MEMCPY_INLINE:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 574d065ab01bb..a82d3cd095659 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -720,8 +720,6 @@ bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
switch (Opc) {
- case TargetOpcode::G_CONCAT_VECTORS:
- return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
case TargetOpcode::G_UADDO:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 0c7e198810da7..f14d970f1e5de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -106,8 +106,6 @@ bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
return true;
switch (MI.getOpcode()) {
- case TargetOpcode::G_CONCAT_VECTORS:
- return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
}
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index fa1ab61a6216f..0965d82f707e6 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4043,28 +4043,28 @@ define <8 x half> @stofp_v8i64_v8f16(<8 x i64> %a) {
; CHECK-GI-FP16-LABEL: stofp_v8i64_v8f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: scvtf v2.2d, v2.2d
; CHECK-GI-FP16-NEXT: scvtf v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: scvtf v2.2d, v2.2d
; CHECK-GI-FP16-NEXT: scvtf v3.2d, v3.2d
; CHECK-GI-FP16-NEXT: mov d4, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d5, v2.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
-; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov d4, v1.d[1]
+; CHECK-GI-FP16-NEXT: mov d5, v1.d[1]
; CHECK-GI-FP16-NEXT: fcvt h1, d1
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov d5, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d3
; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h4, d5
; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h1, d5
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov d1, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
; CHECK-GI-FP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT: fcvt h1, d1
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h1, d2
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v1.h[0]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <8 x i64> %a to <8 x half>
@@ -4103,28 +4103,28 @@ define <8 x half> @utofp_v8i64_v8f16(<8 x i64> %a) {
; CHECK-GI-FP16-LABEL: utofp_v8i64_v8f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v2.2d
; CHECK-GI-FP16-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v2.2d
; CHECK-GI-FP16-NEXT: ucvtf v3.2d, v3.2d
; CHECK-GI-FP16-NEXT: mov d4, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d5, v2.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
-; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov d4, v1.d[1]
+; CHECK-GI-FP16-NEXT: mov d5, v1.d[1]
; CHECK-GI-FP16-NEXT: fcvt h1, d1
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov d5, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d3
; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h4, d5
; CHECK-GI-FP16-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h1, d5
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov d1, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
; CHECK-GI-FP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v1.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT: fcvt h1, d1
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h1, d2
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v1.h[0]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <8 x i64> %a to <8 x half>
@@ -4183,51 +4183,51 @@ define <16 x half> @stofp_v16i64_v16f16(<16 x i64> %a) {
; CHECK-GI-FP16-LABEL: stofp_v16i64_v16f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: scvtf v16.2d, v2.2d
; CHECK-GI-FP16-NEXT: scvtf v4.2d, v4.2d
-; CHECK-GI-FP16-NEXT: scvtf v2.2d, v6.2d
-; CHECK-GI-FP16-NEXT: scvtf v20.2d, v1.2d
-; CHECK-GI-FP16-NEXT: scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: scvtf v18.2d, v1.2d
; CHECK-GI-FP16-NEXT: scvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT: scvtf v7.2d, v7.2d
-; CHECK-GI-FP16-NEXT: mov d6, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d17, v16.d[1]
-; CHECK-GI-FP16-NEXT: mov d18, v4.d[1]
-; CHECK-GI-FP16-NEXT: mov d19, v2.d[1]
+; CHECK-GI-FP16-NEXT: scvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT: scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: mov d16, v0.d[1]
+; CHECK-GI-FP16-NEXT: mov d17, v4.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h16, d16
; CHECK-GI-FP16-NEXT: fcvt h1, d4
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: fcvt h6, d6
-; CHECK-GI-FP16-NEXT: fcvt h17, d17
-; CHECK-GI-FP16-NEXT: fcvt h4, d18
-; CHECK-GI-FP16-NEXT: fcvt h18, d19
-; CHECK-GI-FP16-NEXT: fcvt h19, d20
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov d6, v20.d[1]
-; CHECK-GI-FP16-NEXT: mov v16.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov d17, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d3
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov d4, v5.d[1]
+; CHECK-GI-FP16-NEXT: mov d19, v5.d[1]
; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v18.h[0]
-; CHECK-GI-FP16-NEXT: mov d18, v7.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h7, d7
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[2], v3.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h3, d6
+; CHECK-GI-FP16-NEXT: fcvt h16, d16
+; CHECK-GI-FP16-NEXT: fcvt h4, d17
+; CHECK-GI-FP16-NEXT: mov d17, v18.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h18, d18
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v16.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v4.2d, v6.2d
; CHECK-GI-FP16-NEXT: fcvt h6, d17
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: fcvt h16, d19
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v18.h[0]
; CHECK-GI-FP16-NEXT: mov v1.h[2], v5.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h5, d18
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v7.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v3.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT: mov d5, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: mov d17, v4.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v16.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v6.2d, v7.2d
+; CHECK-GI-FP16-NEXT: fcvt h5, d5
+; CHECK-GI-FP16-NEXT: fcvt h7, d17
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v4.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: mov d4, v6.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v7.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[7], v4.h[0]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <16 x i64> %a to <16 x half>
@@ -4286,51 +4286,51 @@ define <16 x half> @utofp_v16i64_v16f16(<16 x i64> %a) {
; CHECK-GI-FP16-LABEL: utofp_v16i64_v16f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: ucvtf v16.2d, v2.2d
; CHECK-GI-FP16-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v6.2d
-; CHECK-GI-FP16-NEXT: ucvtf v20.2d, v1.2d
-; CHECK-GI-FP16-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: ucvtf v18.2d, v1.2d
; CHECK-GI-FP16-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-GI-FP16-NEXT: mov d6, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d17, v16.d[1]
-; CHECK-GI-FP16-NEXT: mov d18, v4.d[1]
-; CHECK-GI-FP16-NEXT: mov d19, v2.d[1]
+; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: mov d16, v0.d[1]
+; CHECK-GI-FP16-NEXT: mov d17, v4.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h16, d16
; CHECK-GI-FP16-NEXT: fcvt h1, d4
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: fcvt h6, d6
-; CHECK-GI-FP16-NEXT: fcvt h17, d17
-; CHECK-GI-FP16-NEXT: fcvt h4, d18
-; CHECK-GI-FP16-NEXT: fcvt h18, d19
-; CHECK-GI-FP16-NEXT: fcvt h19, d20
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov d6, v20.d[1]
-; CHECK-GI-FP16-NEXT: mov v16.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov d17, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d3
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov d4, v5.d[1]
+; CHECK-GI-FP16-NEXT: mov d19, v5.d[1]
; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v18.h[0]
-; CHECK-GI-FP16-NEXT: mov d18, v7.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h7, d7
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[2], v3.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h3, d6
+; CHECK-GI-FP16-NEXT: fcvt h16, d16
+; CHECK-GI-FP16-NEXT: fcvt h4, d17
+; CHECK-GI-FP16-NEXT: mov d17, v18.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h18, d18
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v16.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT: ucvtf v4.2d, v6.2d
; CHECK-GI-FP16-NEXT: fcvt h6, d17
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: fcvt h16, d19
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v18.h[0]
; CHECK-GI-FP16-NEXT: mov v1.h[2], v5.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h5, d18
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v7.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v3.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT: mov d5, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: mov d17, v4.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v16.h[0]
+; CHECK-GI-FP16-NEXT: ucvtf v6.2d, v7.2d
+; CHECK-GI-FP16-NEXT: fcvt h5, d5
+; CHECK-GI-FP16-NEXT: fcvt h7, d17
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v4.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: mov d4, v6.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v5.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v7.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: fcvt h4, d4
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[7], v4.h[0]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <16 x i64> %a to <16 x half>
@@ -4436,103 +4436,104 @@ define <32 x half> @stofp_v32i64_v32f16(<32 x i64> %a) {
;
; CHECK-GI-FP16-LABEL: stofp_v32i64_v32f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: scvtf v16.2d, v2.2d
+; CHECK-GI-FP16-NEXT: ldp q16, q18, [sp]
; CHECK-GI-FP16-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: scvtf v18.2d, v4.2d
-; CHECK-GI-FP16-NEXT: scvtf v17.2d, v6.2d
-; CHECK-GI-FP16-NEXT: scvtf v4.2d, v1.2d
-; CHECK-GI-FP16-NEXT: scvtf v3.2d, v3.2d
-; CHECK-GI-FP16-NEXT: ldp q1, q23, [sp]
+; CHECK-GI-FP16-NEXT: ldp q17, q19, [sp, #64]
+; CHECK-GI-FP16-NEXT: scvtf v4.2d, v4.2d
+; CHECK-GI-FP16-NEXT: scvtf v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: scvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT: scvtf v6.2d, v7.2d
-; CHECK-GI-FP16-NEXT: mov d20, v16.d[1]
-; CHECK-GI-FP16-NEXT: mov d19, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d21, v18.d[1]
-; CHECK-GI-FP16-NEXT: mov d22, v17.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h16, d16
-; CHECK-GI-FP16-NEXT: scvtf v2.2d, v1.2d
+; CHECK-GI-FP16-NEXT: scvtf v6.2d, v6.2d
+; CHECK-GI-FP16-NEXT: scvtf v20.2d, v16.2d
+; CHECK-GI-FP16-NEXT: scvtf v24.2d, v18.2d
+; CHECK-GI-FP16-NEXT: scvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT: scvtf v16.2d, v17.2d
+; CHECK-GI-FP16-NEXT: mov d21, v0.d[1]
+; CHECK-GI-FP16-NEXT: scvtf v25.2d, v19.2d
+; CHECK-GI-FP16-NEXT: mov d22, v4.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h1, d18
-; CHECK-GI-FP16-NEXT: ldr q18, [sp, #32]
-; CHECK-GI-FP16-NEXT: fcvt h7, d17
-; CHECK-GI-FP16-NEXT: ldp q25, q17, [sp, #48]
-; CHECK-GI-FP16-NEXT: fcvt h20, d20
-; CHECK-GI-FP16-NEXT: fcvt h24, d19
-; CHECK-GI-FP16-NEXT: fcvt h21, d21
-; CHECK-GI-FP16-NEXT: fcvt h22, d22
-; CHECK-GI-FP16-NEXT: scvtf v18.2d, v18.2d
-; CHECK-GI-FP16-NEXT: fcvt h26, d4
-; CHECK-GI-FP16-NEXT: scvtf v17.2d, v17.2d
-; CHECK-GI-FP16-NEXT: fcvt h27, d3
-; CHECK-GI-FP16-NEXT: fcvt h28, d6
-; CHECK-GI-FP16-NEXT: scvtf v23.2d, v23.2d
-; CHECK-GI-FP16-NEXT: scvtf v25.2d, v25.2d
-; CHECK-GI-FP16-NEXT: mov d4, v4.d[1]
-; CHECK-GI-FP16-NEXT: mov v16.h[1], v20.h[0]
-; CHECK-GI-FP16-NEXT: ldp q19, q20, [sp, #80]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v24.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d5
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v21.h[0]
-; CHECK-GI-FP16-NEXT: ldr q21, [sp, #112]
-; CHECK-GI-FP16-NEXT: mov v7.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: mov d22, v2.d[1]
-; CHECK-GI-FP16-NEXT: scvtf v20.2d, v20.2d
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: scvtf v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v16.h[2], v27.h[0]
-; CHECK-GI-FP16-NEXT: scvtf v21.2d, v21.2d
-; CHECK-GI-FP16-NEXT: mov d5, v5.d[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT: mov d26, v18.d[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT: mov d24, v17.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h22, d22
-; CHECK-GI-FP16-NEXT: mov v7.h[2], v28.h[0]
-; CHECK-GI-FP16-NEXT: mov d27, v20.d[1]
+; CHECK-GI-FP16-NEXT: scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: mov d18, v1.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h17, d1
+; CHECK-GI-FP16-NEXT: mov d19, v5.d[1]
+; CHECK-GI-FP16-NEXT: mov d23, v20.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h1, d4
+; CHECK-GI-FP16-NEXT: fcvt h4, d20
+; CHECK-GI-FP16-NEXT: mov d26, v16.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h20, d5
+; CHECK-GI-FP16-NEXT: fcvt h5, d16
+; CHECK-GI-FP16-NEXT: fcvt h28, d21
+; CHECK-GI-FP16-NEXT: fcvt h29, d22
+; CHECK-GI-FP16-NEXT: fcvt h22, d24
+; CHECK-GI-FP16-NEXT: fcvt h21, d25
; CHECK-GI-FP16-NEXT: fcvt h18, d18
-; CHECK-GI-FP16-NEXT: mov d28, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d17
-; CHECK-GI-FP16-NEXT: fcvt h20, d20
-; CHECK-GI-FP16-NEXT: mov d6, v6.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h26, d26
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
-; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: fcvt h17, d24
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d23
-; CHECK-GI-FP16-NEXT: fcvt h22, d27
-; CHECK-GI-FP16-NEXT: mov d23, v23.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h6, d6
-; CHECK-GI-FP16-NEXT: mov v18.h[1], v26.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h26, d25
-; CHECK-GI-FP16-NEXT: mov d25, v25.d[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov d17, v19.d[1]
; CHECK-GI-FP16-NEXT: fcvt h19, d19
-; CHECK-GI-FP16-NEXT: mov v20.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: mov d22, v21.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h21, d21
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d28
+; CHECK-GI-FP16-NEXT: fcvt h27, d23
+; CHECK-GI-FP16-NEXT: mov d23, v24.d[1]
+; CHECK-GI-FP16-NEXT: mov d24, v25.d[1]
+; CHECK-GI-FP16-NEXT: ldp q25, q16, [sp, #32]
+; CHECK-GI-FP16-NEXT: fcvt h26, d26
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v28.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v29.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v7.2d, v7.2d
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v27.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v25.2d, v25.2d
; CHECK-GI-FP16-NEXT: fcvt h23, d23
-; CHECK-GI-FP16-NEXT: mov v18.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h25, d25
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v26.h[0]
+; CHECK-GI-FP16-NEXT: ldp q26, q27, [sp, #96]
+; CHECK-GI-FP16-NEXT: fcvt h24, d24
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v20.h[0]
+; CHECK-GI-FP16-NEXT: mov d20, v6.d[1]
+; CHECK-GI-FP16-NEXT: mov d17, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: scvtf v26.2d, v26.2d
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v22.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h22, d25
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v21.h[0]
+; CHECK-GI-FP16-NEXT: mov d21, v25.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v18.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v19.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v16.2d, v16.2d
+; CHECK-GI-FP16-NEXT: scvtf v18.2d, v27.2d
+; CHECK-GI-FP16-NEXT: fcvt h19, d20
; CHECK-GI-FP16-NEXT: fcvt h17, d17
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h19, d22
-; CHECK-GI-FP16-NEXT: mov v20.h[2], v21.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[3], v24.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v23.h[0]
-; CHECK-GI-FP16-NEXT: mov v18.h[3], v25.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov v20.h[3], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v18.d[0]
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v20.d[0]
+; CHECK-GI-FP16-NEXT: mov d25, v26.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h26, d26
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v23.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v24.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h20, d21
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: mov d23, v18.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h18, d18
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: fcvt h21, d25
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v22.h[0]
+; CHECK-GI-FP16-NEXT: mov d22, v16.d[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v26.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h16, d16
+; CHECK-GI-FP16-NEXT: mov d6, v7.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h7, d7
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v19.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h19, d23
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v20.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h17, d22
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v21.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v7.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v16.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v18.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[7], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[7], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[7], v19.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT: mov v3.16b, v5.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%c = sitofp <32 x i64> %a to <32 x half>
@@ -4638,103 +4639,104 @@ define <32 x half> @utofp_v32i64_v32f16(<32 x i64> %a) {
;
; CHECK-GI-FP16-LABEL: utofp_v32i64_v32f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: ucvtf v16.2d, v2.2d
+; CHECK-GI-FP16-NEXT: ldp q16, q18, [sp]
; CHECK-GI-FP16-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: ucvtf v18.2d, v4.2d
-; CHECK-GI-FP16-NEXT: ucvtf v17.2d, v6.2d
-; CHECK-GI-FP16-NEXT: ucvtf v4.2d, v1.2d
-; CHECK-GI-FP16-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-GI-FP16-NEXT: ldp q1, q23, [sp]
+; CHECK-GI-FP16-NEXT: ldp q17, q19, [sp, #64]
+; CHECK-GI-FP16-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-GI-FP16-NEXT: ucvtf v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT: ucvtf v6.2d, v7.2d
-; CHECK-GI-FP16-NEXT: mov d20, v16.d[1]
-; CHECK-GI-FP16-NEXT: mov d19, v0.d[1]
-; CHECK-GI-FP16-NEXT: mov d21, v18.d[1]
-; CHECK-GI-FP16-NEXT: mov d22, v17.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h16, d16
-; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v1.2d
+; CHECK-GI-FP16-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-GI-FP16-NEXT: ucvtf v20.2d, v16.2d
+; CHECK-GI-FP16-NEXT: ucvtf v24.2d, v18.2d
+; CHECK-GI-FP16-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT: ucvtf v16.2d, v17.2d
+; CHECK-GI-FP16-NEXT: mov d21, v0.d[1]
+; CHECK-GI-FP16-NEXT: ucvtf v25.2d, v19.2d
+; CHECK-GI-FP16-NEXT: mov d22, v4.d[1]
; CHECK-GI-FP16-NEXT: fcvt h0, d0
-; CHECK-GI-FP16-NEXT: fcvt h1, d18
-; CHECK-GI-FP16-NEXT: ldr q18, [sp, #32]
-; CHECK-GI-FP16-NEXT: fcvt h7, d17
-; CHECK-GI-FP16-NEXT: ldp q25, q17, [sp, #48]
-; CHECK-GI-FP16-NEXT: fcvt h20, d20
-; CHECK-GI-FP16-NEXT: fcvt h24, d19
-; CHECK-GI-FP16-NEXT: fcvt h21, d21
-; CHECK-GI-FP16-NEXT: fcvt h22, d22
-; CHECK-GI-FP16-NEXT: ucvtf v18.2d, v18.2d
-; CHECK-GI-FP16-NEXT: fcvt h26, d4
-; CHECK-GI-FP16-NEXT: ucvtf v17.2d, v17.2d
-; CHECK-GI-FP16-NEXT: fcvt h27, d3
-; CHECK-GI-FP16-NEXT: fcvt h28, d6
-; CHECK-GI-FP16-NEXT: ucvtf v23.2d, v23.2d
-; CHECK-GI-FP16-NEXT: ucvtf v25.2d, v25.2d
-; CHECK-GI-FP16-NEXT: mov d4, v4.d[1]
-; CHECK-GI-FP16-NEXT: mov v16.h[1], v20.h[0]
-; CHECK-GI-FP16-NEXT: ldp q19, q20, [sp, #80]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v24.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d5
-; CHECK-GI-FP16-NEXT: mov v1.h[1], v21.h[0]
-; CHECK-GI-FP16-NEXT: ldr q21, [sp, #112]
-; CHECK-GI-FP16-NEXT: mov v7.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: mov d22, v2.d[1]
-; CHECK-GI-FP16-NEXT: ucvtf v20.2d, v20.2d
-; CHECK-GI-FP16-NEXT: fcvt h2, d2
-; CHECK-GI-FP16-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v16.h[2], v27.h[0]
-; CHECK-GI-FP16-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-GI-FP16-NEXT: mov d5, v5.d[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT: mov d26, v18.d[1]
-; CHECK-GI-FP16-NEXT: mov v1.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT: mov d24, v17.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h22, d22
-; CHECK-GI-FP16-NEXT: mov v7.h[2], v28.h[0]
-; CHECK-GI-FP16-NEXT: mov d27, v20.d[1]
+; CHECK-GI-FP16-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT: mov d18, v1.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h17, d1
+; CHECK-GI-FP16-NEXT: mov d19, v5.d[1]
+; CHECK-GI-FP16-NEXT: mov d23, v20.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h1, d4
+; CHECK-GI-FP16-NEXT: fcvt h4, d20
+; CHECK-GI-FP16-NEXT: mov d26, v16.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h20, d5
+; CHECK-GI-FP16-NEXT: fcvt h5, d16
+; CHECK-GI-FP16-NEXT: fcvt h28, d21
+; CHECK-GI-FP16-NEXT: fcvt h29, d22
+; CHECK-GI-FP16-NEXT: fcvt h22, d24
+; CHECK-GI-FP16-NEXT: fcvt h21, d25
; CHECK-GI-FP16-NEXT: fcvt h18, d18
-; CHECK-GI-FP16-NEXT: mov d28, v3.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h3, d17
-; CHECK-GI-FP16-NEXT: fcvt h20, d20
-; CHECK-GI-FP16-NEXT: mov d6, v6.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h26, d26
-; CHECK-GI-FP16-NEXT: fcvt h4, d4
-; CHECK-GI-FP16-NEXT: fcvt h5, d5
-; CHECK-GI-FP16-NEXT: fcvt h17, d24
-; CHECK-GI-FP16-NEXT: mov v2.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d23
-; CHECK-GI-FP16-NEXT: fcvt h22, d27
-; CHECK-GI-FP16-NEXT: mov d23, v23.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h6, d6
-; CHECK-GI-FP16-NEXT: mov v18.h[1], v26.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h26, d25
-; CHECK-GI-FP16-NEXT: mov d25, v25.d[1]
-; CHECK-GI-FP16-NEXT: mov v3.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov d17, v19.d[1]
; CHECK-GI-FP16-NEXT: fcvt h19, d19
-; CHECK-GI-FP16-NEXT: mov v20.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT: mov d22, v21.d[1]
-; CHECK-GI-FP16-NEXT: fcvt h21, d21
-; CHECK-GI-FP16-NEXT: mov v2.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h24, d28
+; CHECK-GI-FP16-NEXT: fcvt h27, d23
+; CHECK-GI-FP16-NEXT: mov d23, v24.d[1]
+; CHECK-GI-FP16-NEXT: mov d24, v25.d[1]
+; CHECK-GI-FP16-NEXT: ldp q25, q16, [sp, #32]
+; CHECK-GI-FP16-NEXT: fcvt h26, d26
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v28.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[1], v29.h[0]
+; CHECK-GI-FP16-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-GI-FP16-NEXT: mov v4.h[1], v27.h[0]
+; CHECK-GI-FP16-NEXT: ucvtf v25.2d, v25.2d
; CHECK-GI-FP16-NEXT: fcvt h23, d23
-; CHECK-GI-FP16-NEXT: mov v18.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h25, d25
-; CHECK-GI-FP16-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[1], v26.h[0]
+; CHECK-GI-FP16-NEXT: ldp q26, q27, [sp, #96]
+; CHECK-GI-FP16-NEXT: fcvt h24, d24
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v20.h[0]
+; CHECK-GI-FP16-NEXT: mov d20, v6.d[1]
+; CHECK-GI-FP16-NEXT: mov d17, v2.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: ucvtf v26.2d, v26.2d
+; CHECK-GI-FP16-NEXT: mov v4.h[2], v22.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h22, d25
+; CHECK-GI-FP16-NEXT: mov v5.h[2], v21.h[0]
+; CHECK-GI-FP16-NEXT: mov d21, v25.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v18.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[3], v19.h[0]
+; CHECK-GI-FP16-NEXT: ucvtf v16.2d, v16.2d
+; CHECK-GI-FP16-NEXT: ucvtf v18.2d, v27.2d
+; CHECK-GI-FP16-NEXT: fcvt h19, d20
; CHECK-GI-FP16-NEXT: fcvt h17, d17
-; CHECK-GI-FP16-NEXT: mov v3.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT: fcvt h19, d22
-; CHECK-GI-FP16-NEXT: mov v20.h[2], v21.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v16.h[3], v24.h[0]
-; CHECK-GI-FP16-NEXT: mov v2.h[3], v23.h[0]
-; CHECK-GI-FP16-NEXT: mov v18.h[3], v25.h[0]
-; CHECK-GI-FP16-NEXT: mov v3.h[3], v17.h[0]
-; CHECK-GI-FP16-NEXT: mov v20.h[3], v19.h[0]
-; CHECK-GI-FP16-NEXT: mov v1.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v18.d[0]
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v20.d[0]
+; CHECK-GI-FP16-NEXT: mov d25, v26.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h26, d26
+; CHECK-GI-FP16-NEXT: mov v4.h[3], v23.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[3], v24.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h20, d21
+; CHECK-GI-FP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[4], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT: mov d23, v18.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h18, d18
+; CHECK-GI-FP16-NEXT: fcvt h3, d3
+; CHECK-GI-FP16-NEXT: fcvt h21, d25
+; CHECK-GI-FP16-NEXT: mov v4.h[4], v22.h[0]
+; CHECK-GI-FP16-NEXT: mov d22, v16.d[1]
+; CHECK-GI-FP16-NEXT: mov v5.h[4], v26.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h16, d16
+; CHECK-GI-FP16-NEXT: mov d6, v7.d[1]
+; CHECK-GI-FP16-NEXT: fcvt h7, d7
+; CHECK-GI-FP16-NEXT: mov v0.h[5], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[5], v19.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h19, d23
+; CHECK-GI-FP16-NEXT: fcvt h2, d2
+; CHECK-GI-FP16-NEXT: mov v4.h[5], v20.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h17, d22
+; CHECK-GI-FP16-NEXT: mov v5.h[5], v21.h[0]
+; CHECK-GI-FP16-NEXT: fcvt h6, d6
+; CHECK-GI-FP16-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[6], v7.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[6], v16.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[6], v18.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT: mov v1.h[7], v6.h[0]
+; CHECK-GI-FP16-NEXT: mov v4.h[7], v17.h[0]
+; CHECK-GI-FP16-NEXT: mov v5.h[7], v19.h[0]
+; CHECK-GI-FP16-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT: mov v3.16b, v5.16b
; CHECK-GI-FP16-NEXT: ret
entry:
%c = uitofp <32 x i64> %a to <32 x half>
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index f17b9724aadba..01620652301ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1137,11 +1137,8 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
;
; CHECK-GI-LABEL: vselect_constant_cond_zero_v4i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI85_1
-; CHECK-GI-NEXT: adrp x9, .LCPI85_0
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI85_1]
-; CHECK-GI-NEXT: ldr d2, [x9, :lo12:.LCPI85_0]
-; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: adrp x8, .LCPI85_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI85_0]
; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31
; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
@@ -1204,11 +1201,8 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-GI-LABEL: vselect_constant_cond_v4i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI88_1
-; CHECK-GI-NEXT: adrp x9, .LCPI88_0
-; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI88_1]
-; CHECK-GI-NEXT: ldr d3, [x9, :lo12:.LCPI88_0]
-; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT: adrp x8, .LCPI88_0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI88_0]
; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31
; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #31
; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 1531154b8a03c..86dd1bdd511eb 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2092,104 +2092,104 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
-; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT: ldr b4, [x1]
-; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT: ldr b2, [x1]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #1]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #8]
+; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT: mov v2.b[1], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #2]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #17]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #17]
+; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #3]
+; CHECK-GI-DOT-NEXT: mov v2.b[2], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #3]
+; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #4]
+; CHECK-GI-DOT-NEXT: mov v2.b[3], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #4]
+; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #5]
+; CHECK-GI-DOT-NEXT: mov v2.b[4], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #5]
+; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #6]
+; CHECK-GI-DOT-NEXT: mov v2.b[5], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #6]
+; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #7]
+; CHECK-GI-DOT-NEXT: mov v2.b[6], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #7]
+; CHECK-GI-DOT-NEXT: mov v1.b[7], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #16]
+; CHECK-GI-DOT-NEXT: mov v2.b[7], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #16]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
-; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
-; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
-; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
-; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
-; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
-; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
-; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
-; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
-; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT: mov v1.b[8], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #9]
+; CHECK-GI-DOT-NEXT: mov v2.b[8], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #9]
+; CHECK-GI-DOT-NEXT: mov v3.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #19]
+; CHECK-GI-DOT-NEXT: mov v4.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT: mov v1.b[9], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #10]
+; CHECK-GI-DOT-NEXT: mov v2.b[9], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
-; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
-; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
-; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
-; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
-; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
-; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #20]
+; CHECK-GI-DOT-NEXT: mov v4.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT: mov v1.b[10], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #11]
+; CHECK-GI-DOT-NEXT: mov v2.b[10], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #11]
+; CHECK-GI-DOT-NEXT: mov v3.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #21]
+; CHECK-GI-DOT-NEXT: mov v4.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT: mov v1.b[11], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #12]
+; CHECK-GI-DOT-NEXT: mov v2.b[11], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #12]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
-; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
-; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
-; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
-; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
-; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
-; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #22]
+; CHECK-GI-DOT-NEXT: mov v4.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT: mov v1.b[12], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #13]
+; CHECK-GI-DOT-NEXT: mov v2.b[12], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #13]
+; CHECK-GI-DOT-NEXT: mov v3.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #23]
+; CHECK-GI-DOT-NEXT: mov v4.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT: mov v1.b[13], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #14]
+; CHECK-GI-DOT-NEXT: mov v2.b[13], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #14]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
-; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
-; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: udot v3.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT: mov v4.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT: mov v1.b[14], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15]
+; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15]
+; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0]
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0]
+; CHECK-GI-DOT-NEXT: udot v0.4s, v4.16b, v3.16b
+; CHECK-GI-DOT-NEXT: udot v5.4s, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
@@ -2670,104 +2670,104 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
-; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT: ldr b4, [x1]
-; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT: ldr b2, [x1]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #1]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #8]
+; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT: mov v2.b[1], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #2]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #17]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #17]
+; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #3]
+; CHECK-GI-DOT-NEXT: mov v2.b[2], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #3]
+; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #4]
+; CHECK-GI-DOT-NEXT: mov v2.b[3], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #4]
+; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #5]
+; CHECK-GI-DOT-NEXT: mov v2.b[4], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #5]
+; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #6]
+; CHECK-GI-DOT-NEXT: mov v2.b[5], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #6]
+; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #7]
+; CHECK-GI-DOT-NEXT: mov v2.b[6], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #7]
+; CHECK-GI-DOT-NEXT: mov v1.b[7], v3.b[0]
+; CHECK-GI-DOT-NEXT: ldr b3, [x0, #16]
+; CHECK-GI-DOT-NEXT: mov v2.b[7], v4.b[0]
+; CHECK-GI-DOT-NEXT: ldr b4, [x1, #16]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
-; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
-; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
-; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
-; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
-; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
-; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
-; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
-; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
-; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT: mov v1.b[8], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #9]
+; CHECK-GI-DOT-NEXT: mov v2.b[8], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #9]
+; CHECK-GI-DOT-NEXT: mov v3.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #19]
+; CHECK-GI-DOT-NEXT: mov v4.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT: mov v1.b[9], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #10]
+; CHECK-GI-DOT-NEXT: mov v2.b[9], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
-; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
-; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
-; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
-; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
-; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
-; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #20]
+; CHECK-GI-DOT-NEXT: mov v4.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT: mov v1.b[10], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #11]
+; CHECK-GI-DOT-NEXT: mov v2.b[10], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #11]
+; CHECK-GI-DOT-NEXT: mov v3.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #21]
+; CHECK-GI-DOT-NEXT: mov v4.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT: mov v1.b[11], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #12]
+; CHECK-GI-DOT-NEXT: mov v2.b[11], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #12]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
-; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
-; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
-; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
-; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
-; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
-; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
-; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
-; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #22]
+; CHECK-GI-DOT-NEXT: mov v4.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT: mov v1.b[12], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #13]
+; CHECK-GI-DOT-NEXT: mov v2.b[12], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #13]
+; CHECK-GI-DOT-NEXT: mov v3.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT: ldr b7, [x0, #23]
+; CHECK-GI-DOT-NEXT: mov v4.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT: mov v1.b[13], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #14]
+; CHECK-GI-DOT-NEXT: mov v2.b[13], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #14]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
-; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
-; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: sdot v3.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT: mov v4.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT: mov v1.b[14], v5.b[0]
+; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15]
+; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0]
+; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15]
+; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0]
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0]
+; CHECK-GI-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b
+; CHECK-GI-DOT-NEXT: sdot v5.4s, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
>From b3cb99b9456b4c771e12ac0452500d0bf96ae568 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 7 Feb 2024 10:21:28 +0000
Subject: [PATCH 2/2] fixup! [AArch64][GloablISel] Refactor Combine
G_CONCAT_VECTOR
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 16 +++++------
.../include/llvm/Target/GlobalISel/Combine.td | 4 +--
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 27 ++++++++++---------
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
4 files changed, 24 insertions(+), 25 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 133b5d2c4e6b7..6a805ee40a7d8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -227,19 +227,15 @@ class CombinerHelper {
/// ==========================================================
/// Check if the G_CONCAT_VECTORS \p MI is undef or if it
/// can be flattened into a build_vector.
- /// In the first case \p bool will be true.
- /// In the second case \p SmallVector<Register> will contain the operands
+ /// In the first case \p Ops will be empty.
+ /// In the second case \p Ops will contain the operands
/// needed to produce the flattened build_vector.
///
/// \pre MI.getOpcode() == G_CONCAT_VECTORS.
- bool
- matchCombineConcatVectors(MachineInstr &MI,
- std::pair<bool, SmallVector<Register>> &matchinfo);
- /// Replace \p MI with a flattened build_vector with \p SmallVector<Register>
- /// or an implicit_def if \p bool is true.
- void
- applyCombineConcatVectors(MachineInstr &MI,
- std::pair<bool, SmallVector<Register>> &matchinfo);
+ bool matchCombineConcatVectors(MachineInstr &MI, SmallVector<Register> &Ops);
+ /// Replace \p MI with a flattened build_vector built from \p Ops,
+ /// or with an implicit_def if \p Ops is empty.
+ void applyCombineConcatVectors(MachineInstr &MI, SmallVector<Register> &Ops);
/// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
/// Returns true if MI changed.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77a6faaf837d5..7eadb718f1641 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1254,7 +1254,7 @@ def match_ors : GICombineRule<
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
// Combines concat operations
-def concat_matchinfo : GIDefMatchData<"std::pair<bool, SmallVector<Register>>">;
+def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
def combine_concat_vector : GICombineRule<
(defs root:$root, concat_matchinfo:$matchinfo),
(match (wip_match_opcode G_CONCAT_VECTORS):$root,
@@ -1342,4 +1342,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
// compile time performance.
def optnone_combines : GICombineGroup<[trivial_combines,
ptr_add_immed_chain, combines_for_extload,
- not_cmp_fold, opt_brcond_by_inverting_cond]>;
+ not_cmp_fold, opt_brcond_by_inverting_cond, combine_concat_vector]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 71b383b54a141..b400eb34e2901 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -222,11 +222,11 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
replaceRegWith(MRI, DstReg, SrcReg);
}
-bool CombinerHelper::matchCombineConcatVectors(
- MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
+bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI,
+ SmallVector<Register> &Ops) {
assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
"Invalid instruction");
- matchinfo.first = true;
+ bool IsUndef = true;
MachineInstr *Undef = nullptr;
// Walk over all the operands of concat vectors and check if they are
@@ -240,11 +240,11 @@ bool CombinerHelper::matchCombineConcatVectors(
return false;
switch (Def->getOpcode()) {
case TargetOpcode::G_BUILD_VECTOR:
- matchinfo.first = false;
+ IsUndef = false;
// Remember the operands of the build_vector to fold
// them into the yet-to-build flattened concat vectors.
for (const MachineOperand &BuildVecMO : Def->uses())
- matchinfo.second.push_back(BuildVecMO.getReg());
+ Ops.push_back(BuildVecMO.getReg());
break;
case TargetOpcode::G_IMPLICIT_DEF: {
LLT OpType = MRI.getType(Reg);
@@ -260,7 +260,7 @@ bool CombinerHelper::matchCombineConcatVectors(
// for the flattening.
for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
EltIdx != EltEnd; ++EltIdx)
- matchinfo.second.push_back(Undef->getOperand(0).getReg());
+ Ops.push_back(Undef->getOperand(0).getReg());
break;
}
default:
@@ -270,15 +270,18 @@ bool CombinerHelper::matchCombineConcatVectors(
// Check if the combine is illegal
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- if (!isLegalOrBeforeLegalizer({TargetOpcode::G_BUILD_VECTOR,
- {DstTy, MRI.getType(matchinfo.second[0])}})) {
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_BUILD_VECTOR, {DstTy, MRI.getType(Ops[0])}})) {
return false;
}
+ if (IsUndef)
+ Ops.clear();
+
return true;
}
-void CombinerHelper::applyCombineConcatVectors(
- MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
+void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI,
+ SmallVector<Register> &Ops) {
// We determined that the concat_vectors can be flatten.
// Generate the flattened build_vector.
Register DstReg = MI.getOperand(0).getReg();
@@ -291,10 +294,10 @@ void CombinerHelper::applyCombineConcatVectors(
// clean that up. For now, given we already gather this information
// in matchCombineConcatVectors, just save compile time and issue the
// right thing.
- if (matchinfo.first)
+ if (Ops.empty())
Builder.buildUndef(NewDstReg);
else
- Builder.buildBuildVector(NewDstReg, matchinfo.second);
+ Builder.buildBuildVector(NewDstReg, Ops);
MI.eraseFromParent();
replaceRegWith(MRI, DstReg, NewDstReg);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index e4d8359c71e62..b839e597d7c46 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -64,7 +64,7 @@ def AArch64PreLegalizerCombiner: GICombiner<
}
def AArch64O0PreLegalizerCombiner: GICombiner<
- "AArch64O0PreLegalizerCombinerImpl", [optnone_combines, combine_concat_vector]> {
+ "AArch64O0PreLegalizerCombinerImpl", [optnone_combines]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
More information about the llvm-commits
mailing list