[llvm] [AArch64][GlobalISel] Prefer to use Vector Truncate (PR #105692)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 09:31:30 PDT 2024
https://github.com/chuongg3 created https://github.com/llvm/llvm-project/pull/105692
Tries to combine scalarised truncates into vector truncate operations
**EXAMPLE**:
`%a(i32), %b(i32) = G_UNMERGE %src(<2 x i32>)`
`%T_a(i16) = G_TRUNC %a(i32)`
`%T_b(i16) = G_TRUNC %b(i32)`
`%Imp(i16) = G_IMPLICIT_DEF(i16)`
`%dst(<4 x i16>) = G_MERGE_VALUES %T_a(i16), %T_b(i16), %Imp(i16), %Imp(i16)`
**===>**
`%Imp(<2 x i32>) = G_IMPLICIT_DEF(<2 x i32>)`
`%Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x i32>), %Imp(<2 x i32>)`
`%dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>)`
From 75082df825c4d4a12973ecbfef7a9fcd3334eb63 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Thu, 1 Aug 2024 08:55:36 +0000
Subject: [PATCH 1/4] [GlobalISel] Look between instructions to be matched
When a pattern is matched in TableGen, a check called
isObviouslySafeToFold() is run. One of the conditions that it checks for is
whether the instructions that are being matched are consecutive, so that
the instruction's insertion point does not change.
This patch allows the movement of the insertion point of a load
instruction if none of the intervening instructions are stores or have
side-effects.
---
.../GlobalISel/GIMatchTableExecutor.cpp | 32 +++-
.../AArch64/GlobalISel/select-load.mir | 148 +++++++++++++++---
.../AArch64/arm64-indexed-vector-ldst.ll | 67 +++-----
llvm/test/CodeGen/AArch64/arm64-ld1.ll | 130 +++++----------
4 files changed, 217 insertions(+), 160 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
index 26752369a7711a..c44fe3bcd9cf22 100644
--- a/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
@@ -61,15 +61,41 @@ bool GIMatchTableExecutor::isBaseWithConstantOffset(
bool GIMatchTableExecutor::isObviouslySafeToFold(MachineInstr &MI,
MachineInstr &IntoMI) const {
+ auto IntoMIIter = IntoMI.getIterator();
+
// Immediate neighbours are already folded.
if (MI.getParent() == IntoMI.getParent() &&
- std::next(MI.getIterator()) == IntoMI.getIterator())
+ std::next(MI.getIterator()) == IntoMIIter)
return true;
// Convergent instructions cannot be moved in the CFG.
if (MI.isConvergent() && MI.getParent() != IntoMI.getParent())
return false;
- return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() &&
- !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty();
+ if (MI.isLoadFoldBarrier())
+ return false;
+
+ // If the load is simple, check instructions between MI and IntoMI
+ if (MI.mayLoad() && MI.getParent() == IntoMI.getParent()) {
+ if (MI.memoperands_empty())
+ return false;
+ auto &MMO = **(MI.memoperands_begin());
+ if (MMO.isAtomic() || MMO.isVolatile())
+ return false;
+
+ // Ensure instructions between MI and IntoMI are not affected when combined
+ unsigned Iter = 0;
+ const unsigned MaxIter = 20;
+ for (auto CurrMI = MI.getIterator(); CurrMI != IntoMIIter; ++CurrMI) {
+ if (CurrMI->isLoadFoldBarrier())
+ return false;
+
+ if (Iter++ == MaxIter)
+ return false;
+ }
+
+ return true;
+ }
+
+ return true;
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-load.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-load.mir
index 3a46b2a943288b..20e1a93fe2e639 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-load.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-load.mir
@@ -41,8 +41,10 @@
define void @anyext_on_fpr() { ret void }
define void @anyext_on_fpr8() { ret void }
-...
+ define void @load_s32_gpr_LD1() { ret void }
+ define void @load_s32_gpr_GIM() { ret void }
+...
---
name: load_s64_gpr
legalized: true
@@ -57,7 +59,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s64_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[COPY]], 0 :: (load (s64) from %ir.addr)
; CHECK-NEXT: $x0 = COPY [[LDRXui]]
%0(p0) = COPY $x0
@@ -79,7 +83,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s32_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load (s32) from %ir.addr)
; CHECK-NEXT: $w0 = COPY [[LDRWui]]
%0(p0) = COPY $x0
@@ -97,7 +103,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s16_gpr_anyext
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[COPY]], 0 :: (load (s16) from %ir.addr)
; CHECK-NEXT: $w0 = COPY [[LDRHHui]]
%0:gpr(p0) = COPY $x0
@@ -119,7 +127,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s16_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[COPY]], 0 :: (load (s16) from %ir.addr)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[LDRHHui]]
; CHECK-NEXT: $w0 = COPY [[COPY1]]
@@ -139,7 +149,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s8_gpr_anyext
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load (s8) from %ir.addr)
; CHECK-NEXT: $w0 = COPY [[LDRBBui]]
%0:gpr(p0) = COPY $x0
@@ -161,7 +173,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s8_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load (s8) from %ir.addr)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[LDRBBui]]
; CHECK-NEXT: $w0 = COPY [[COPY1]]
@@ -188,7 +202,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_fi_s64_gpr
- ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui %stack.0.ptr0, 0 :: (load (s64))
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui %stack.0.ptr0, 0 :: (load (s64))
; CHECK-NEXT: $x0 = COPY [[LDRXui]]
%0(p0) = G_FRAME_INDEX %stack.0.ptr0
%1(s64) = G_LOAD %0 :: (load (s64))
@@ -211,7 +227,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_128_s64_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[COPY]], 16 :: (load (s64) from %ir.addr)
; CHECK-NEXT: $x0 = COPY [[LDRXui]]
%0(p0) = COPY $x0
@@ -237,7 +255,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_512_s32_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 128 :: (load (s32) from %ir.addr)
; CHECK-NEXT: $w0 = COPY [[LDRWui]]
%0(p0) = COPY $x0
@@ -263,7 +283,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_64_s16_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRHHui:%[0-9]+]]:gpr32 = LDRHHui [[COPY]], 32 :: (load (s16) from %ir.addr)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[LDRHHui]]
; CHECK-NEXT: $w0 = COPY [[COPY1]]
@@ -291,7 +313,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_1_s8_gpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 1 :: (load (s8) from %ir.addr)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[LDRBBui]]
; CHECK-NEXT: $w0 = COPY [[COPY1]]
@@ -317,7 +341,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s64_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[COPY]], 0 :: (load (s64) from %ir.addr)
; CHECK-NEXT: $d0 = COPY [[LDRDui]]
%0(p0) = COPY $x0
@@ -339,7 +365,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s32_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRSui:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 :: (load (s32) from %ir.addr)
; CHECK-NEXT: $s0 = COPY [[LDRSui]]
%0(p0) = COPY $x0
@@ -361,7 +389,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s16_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRHui:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 :: (load (s16) from %ir.addr)
; CHECK-NEXT: $h0 = COPY [[LDRHui]]
%0(p0) = COPY $x0
@@ -383,7 +413,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_s8_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 0 :: (load (s8) from %ir.addr)
; CHECK-NEXT: $b0 = COPY [[LDRBui]]
%0(p0) = COPY $x0
@@ -407,7 +439,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_8_s64_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[COPY]], 1 :: (load (s64) from %ir.addr)
; CHECK-NEXT: $d0 = COPY [[LDRDui]]
%0(p0) = COPY $x0
@@ -433,7 +467,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_16_s32_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRSui:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 4 :: (load (s32) from %ir.addr)
; CHECK-NEXT: $s0 = COPY [[LDRSui]]
%0(p0) = COPY $x0
@@ -459,7 +495,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_64_s16_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRHui:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 32 :: (load (s16) from %ir.addr)
; CHECK-NEXT: $h0 = COPY [[LDRHui]]
%0(p0) = COPY $x0
@@ -485,7 +523,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_gep_32_s8_fpr
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 32 :: (load (s8) from %ir.addr)
; CHECK-NEXT: $b0 = COPY [[LDRBui]]
%0(p0) = COPY $x0
@@ -508,7 +548,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_v2s32
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[COPY]], 0 :: (load (<2 x s32>) from %ir.addr)
; CHECK-NEXT: $d0 = COPY [[LDRDui]]
%0(p0) = COPY $x0
@@ -529,7 +571,9 @@ body: |
liveins: $x0
; CHECK-LABEL: name: load_v2s64
- ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load (<2 x s64>) from %ir.addr)
; CHECK-NEXT: $q0 = COPY [[LDRQui]]
%0(p0) = COPY $x0
@@ -712,3 +756,63 @@ body: |
RET_ReallyLR
...
+---
+name: load_s32_gpr_LD1
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $q0, $x0
+
+ ; CHECK-LABEL: name: load_s32_gpr_LD1
+ ; CHECK: liveins: $q0, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[COPY]], 0, [[COPY1]] :: (load (s32))
+ ; CHECK-NEXT: $q0 = COPY [[LD1i32_]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:fpr(<4 x s32>) = COPY $q0
+ %1:gpr(p0) = COPY $x0
+ %2:fpr(s32) = G_LOAD %1(p0) :: (load (s32))
+ %3:gpr(s32) = G_CONSTANT i32 3
+ %5:gpr(s64) = G_CONSTANT i64 0
+ %4:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %2(s32), %5(s64)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+
+name: load_s32_gpr_GIM
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $q0, $x0
+ ;This test should not select an LD1 instruction as there is a store instruction between G_INSERT_VECTOR_ELT and G_LOAD
+ ; CHECK-LABEL: name: load_s32_gpr_GIM
+ ; CHECK: liveins: $q0, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x0
+ ; CHECK-NEXT: [[LDRSui:%[0-9]+]]:fpr32 = LDRSui [[COPY1]], 0 :: (load (s32))
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 3
+ ; CHECK-NEXT: STRWui [[MOVi32imm]], [[COPY1]], 0 :: (store (s32))
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[LDRSui]], %subreg.ssub
+ ; CHECK-NEXT: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[COPY]], 0, [[INSERT_SUBREG]], 0
+ ; CHECK-NEXT: $q0 = COPY [[INSvi32lane]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:fpr(<4 x s32>) = COPY $q0
+ %1:gpr(p0) = COPY $x0
+ %2:fpr(s32) = G_LOAD %1(p0) :: (load (s32))
+ %3:gpr(s32) = G_CONSTANT i32 3
+ G_STORE %3(s32), %1(p0) :: (store (s32))
+ %5:gpr(s64) = G_CONSTANT i64 0
+ %4:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %2(s32), %5(s64)
+ $q0 = COPY %4(<4 x s32>)
+ RET_ReallyLR implicit $q0
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 628fb550a0532b..720951eca6a344 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -13417,9 +13417,8 @@ define <8 x i16> @test_v8i16_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <8 x
;
; CHECK-GI-LABEL: test_v8i16_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
+; CHECK-GI-NEXT: ld1.h { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2, lsl #1
-; CHECK-GI-NEXT: mov.h v0[1], v1[0]
; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load i16, ptr %bar
@@ -13465,12 +13464,11 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4 x
;
; CHECK-GI-LABEL: test_v4i16_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: add x8, x0, x2, lsl #1
-; CHECK-GI-NEXT: mov.h v0[1], v1[0]
-; CHECK-GI-NEXT: str x8, [x1]
+; CHECK-GI-NEXT: ld1.h { v0 }[1], [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load i16, ptr %bar
%tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
@@ -13509,9 +13507,8 @@ define <4 x i32> @test_v4i32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4 x
;
; CHECK-GI-LABEL: test_v4i32_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: ld1.s { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2, lsl #2
-; CHECK-GI-NEXT: mov.s v0[1], v1[0]
; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load i32, ptr %bar
@@ -13557,12 +13554,11 @@ define <2 x i32> @test_v2i32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2 x
;
; CHECK-GI-LABEL: test_v2i32_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: add x8, x0, x2, lsl #2
-; CHECK-GI-NEXT: mov.s v0[1], v1[0]
-; CHECK-GI-NEXT: str x8, [x1]
+; CHECK-GI-NEXT: ld1.s { v0 }[1], [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load i32, ptr %bar
%tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
@@ -13601,9 +13597,8 @@ define <2 x i64> @test_v2i64_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2 x
;
; CHECK-GI-LABEL: test_v2i64_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: ld1.d { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2, lsl #3
-; CHECK-GI-NEXT: mov.d v0[1], v1[0]
; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load i64, ptr %bar
@@ -13643,9 +13638,8 @@ define <4 x float> @test_v4f32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4
;
; CHECK-GI-LABEL: test_v4f32_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: ld1.s { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2, lsl #2
-; CHECK-GI-NEXT: mov.s v0[1], v1[0]
; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load float, ptr %bar
@@ -13691,12 +13685,11 @@ define <2 x float> @test_v2f32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2
;
; CHECK-GI-LABEL: test_v2f32_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: add x8, x0, x2, lsl #2
-; CHECK-GI-NEXT: mov.s v0[1], v1[0]
-; CHECK-GI-NEXT: str x8, [x1]
+; CHECK-GI-NEXT: ld1.s { v0 }[1], [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load float, ptr %bar
%tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
@@ -13735,9 +13728,8 @@ define <2 x double> @test_v2f64_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <
;
; CHECK-GI-LABEL: test_v2f64_post_reg_ld1lane:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: ld1.d { v0 }[1], [x0]
; CHECK-GI-NEXT: add x8, x0, x2, lsl #3
-; CHECK-GI-NEXT: mov.d v0[1], v1[0]
; CHECK-GI-NEXT: str x8, [x1]
; CHECK-GI-NEXT: ret
%tmp1 = load double, ptr %bar
@@ -13792,15 +13784,14 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(ptr %bar, ptr %ptr,
; CHECK-GI-LABEL: test_v4i16_post_reg_ld1lane_forced_narrow:
; CHECK-GI: ; %bb.0:
; CHECK-GI-NEXT: add x8, x0, x2, lsl #1
-; CHECK-GI-NEXT: ldr h1, [x0]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: ld1.h { v0 }[1], [x0]
; CHECK-GI-NEXT: str x8, [x1]
-; CHECK-GI-NEXT: mov.h v0[1], v1[0]
-; CHECK-GI-NEXT: ldr d2, [x3]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: cnt.8b v2, v2
-; CHECK-GI-NEXT: uaddlp.4h v2, v2
-; CHECK-GI-NEXT: uaddlp.2s v1, v2
+; CHECK-GI-NEXT: ldr d1, [x3]
+; CHECK-GI-NEXT: cnt.8b v1, v1
+; CHECK-GI-NEXT: uaddlp.4h v1, v1
+; CHECK-GI-NEXT: uaddlp.2s v1, v1
; CHECK-GI-NEXT: str d1, [x3]
; CHECK-GI-NEXT: ret
%tmp1 = load i16, ptr %bar
@@ -13989,24 +13980,14 @@ define void @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr
}
define <4 x i32> @test_inc_cycle(<4 x i32> %vec, ptr %in) {
-; CHECK-SD-LABEL: test_inc_cycle:
-; CHECK-SD: ; %bb.0:
-; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-SD-NEXT: adrp x9, _var@PAGE
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: add x8, x0, x8, lsl #2
-; CHECK-SD-NEXT: str x8, [x9, _var@PAGEOFF]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_inc_cycle:
-; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
-; CHECK-GI-NEXT: adrp x9, _var@PAGE
-; CHECK-GI-NEXT: mov.s v0[0], v1[0]
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: add x8, x0, x8, lsl #2
-; CHECK-GI-NEXT: str x8, [x9, _var@PAGEOFF]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_inc_cycle:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: adrp x9, _var@PAGE
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: add x8, x0, x8, lsl #2
+; CHECK-NEXT: str x8, [x9, _var@PAGEOFF]
+; CHECK-NEXT: ret
%elt = load i32, ptr %in
%newvec = insertelement <4 x i32> %vec, i32 %elt, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 54b96520dce41d..d3a8f59c96f993 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1021,16 +1021,10 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) {
}
define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_8h:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_8h:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: mov.h v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i16, ptr %bar
%tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0
@@ -1038,16 +1032,10 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) {
}
define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_4s:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_4s:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
-; CHECK-GI-NEXT: mov.s v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i32, ptr %bar
%tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0
@@ -1055,16 +1043,10 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) {
}
define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_4s_float:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_4s_float:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
-; CHECK-GI-NEXT: mov.s v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_4s_float:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load float, ptr %bar
%tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
@@ -1072,16 +1054,10 @@ define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) {
}
define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_2d:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_2d:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d1, [x0]
-; CHECK-GI-NEXT: mov.d v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_2d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i64, ptr %bar
%tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0
@@ -1089,16 +1065,10 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) {
}
define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_2d_double:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_2d_double:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr d1, [x0]
-; CHECK-GI-NEXT: mov.d v0[0], v1[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_2d_double:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load double, ptr %bar
%tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
@@ -1137,20 +1107,12 @@ define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) {
}
define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_4h:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_4h:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov.h v0[0], v1[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i16, ptr %bar
%tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
@@ -1158,20 +1120,12 @@ define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) {
}
define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_2s:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_2s:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov.s v0[0], v1[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i32, ptr %bar
%tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
@@ -1179,20 +1133,12 @@ define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) {
}
define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) {
-; CHECK-SD-LABEL: ld1_2s_float:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ld1_2s_float:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s1, [x0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov.s v0[0], v1[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ld1_2s_float:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load float, ptr %bar
%tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0
From 5c058d95a316202cff24146947e0c97eae58b377 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Thu, 1 Aug 2024 15:22:19 +0000
Subject: [PATCH 2/4] [AArch64][GlobalISel] Lower G_BUILD_VECTOR to
G_INSERT_VECTOR_ELT
The lowering happens in post-legalizer lowering if any source
registers from G_BUILD_VECTOR are not constants.
Add pattern fragment setting `scalar_to_vector ($src)` as
equivalent to `vector_insert (undef), ($src), (i64 0)`
---
llvm/lib/Target/AArch64/AArch64Combine.td | 10 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 73 ++--
.../GISel/AArch64InstructionSelector.cpp | 15 +
.../GISel/AArch64PostLegalizerLowering.cpp | 39 ++
.../legalize-shuffle-vector-widen-crash.ll | 12 +-
...legalizer-lowering-build-vector-to-dup.mir | 24 +-
.../postlegalizer-lowering-shuffle-splat.mir | 16 +-
llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 2 +-
llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 2 +-
llvm/test/CodeGen/AArch64/aarch64-smull.ll | 116 +++---
llvm/test/CodeGen/AArch64/abs.ll | 8 +-
llvm/test/CodeGen/AArch64/arm64-dup.ll | 61 ++-
.../AArch64/arm64-extract-insert-varidx.ll | 45 +--
.../AArch64/arm64-indexed-vector-ldst.ll | 82 ++--
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 318 +++++++--------
.../CodeGen/AArch64/arm64-subvector-extend.ll | 198 +++++-----
llvm/test/CodeGen/AArch64/arm64-tbl.ll | 159 ++++----
llvm/test/CodeGen/AArch64/bitcast.ll | 106 ++---
llvm/test/CodeGen/AArch64/bswap.ll | 6 +-
llvm/test/CodeGen/AArch64/concat-vector.ll | 178 ++++-----
llvm/test/CodeGen/AArch64/fabs.ll | 32 +-
llvm/test/CodeGen/AArch64/faddsub.ll | 80 ++--
llvm/test/CodeGen/AArch64/fcmp.ll | 334 ++++++++--------
llvm/test/CodeGen/AArch64/fcopysign.ll | 34 +-
llvm/test/CodeGen/AArch64/fcvt.ll | 224 +++++------
llvm/test/CodeGen/AArch64/fdiv.ll | 40 +-
llvm/test/CodeGen/AArch64/fexplog.ll | 315 +++++++--------
.../AArch64/fixed-vector-interleave.ll | 14 +-
llvm/test/CodeGen/AArch64/fminimummaximum.ll | 80 ++--
llvm/test/CodeGen/AArch64/fminmax.ll | 80 ++--
llvm/test/CodeGen/AArch64/fmla.ll | 168 ++++----
llvm/test/CodeGen/AArch64/fmul.ll | 40 +-
llvm/test/CodeGen/AArch64/fneg.ll | 32 +-
llvm/test/CodeGen/AArch64/fpow.ll | 12 +-
llvm/test/CodeGen/AArch64/fpowi.ll | 12 +-
llvm/test/CodeGen/AArch64/fptoi.ll | 70 ++--
llvm/test/CodeGen/AArch64/fptrunc.ll | 12 +-
llvm/test/CodeGen/AArch64/frem.ll | 12 +-
llvm/test/CodeGen/AArch64/fsincos.ll | 126 +++---
llvm/test/CodeGen/AArch64/fsqrt.ll | 32 +-
llvm/test/CodeGen/AArch64/icmp.ll | 16 +-
llvm/test/CodeGen/AArch64/insertextract.ll | 47 +--
llvm/test/CodeGen/AArch64/itofp.ll | 180 ++++-----
llvm/test/CodeGen/AArch64/llvm.exp10.ll | 10 +-
llvm/test/CodeGen/AArch64/load.ll | 48 +--
.../AArch64/neon-bitwise-instructions.ll | 34 +-
.../AArch64/neon-compare-instructions.ll | 12 +-
llvm/test/CodeGen/AArch64/neon-extadd.ll | 162 ++++----
llvm/test/CodeGen/AArch64/neon-extmul.ll | 28 +-
llvm/test/CodeGen/AArch64/neon-perm.ll | 13 +-
llvm/test/CodeGen/AArch64/ptradd.ll | 52 ++-
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 38 +-
llvm/test/CodeGen/AArch64/sext.ll | 190 ++++-----
llvm/test/CodeGen/AArch64/shift.ll | 177 +++++----
llvm/test/CodeGen/AArch64/shufflevector.ll | 70 ++--
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 38 +-
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 38 +-
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 38 +-
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 368 +++++++++---------
llvm/test/CodeGen/AArch64/xtn.ll | 42 +-
llvm/test/CodeGen/AArch64/zext.ll | 152 ++++----
61 files changed, 2444 insertions(+), 2528 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 3f717c8a60050f..ef00e962f3870f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -290,6 +290,13 @@ def combine_mul_cmlt : GICombineRule<
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;
+def lower_build_insert_vec_elt : GICombineRule<
+ (defs root:$root, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_BUILD_VECTOR):$root,
+ [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]),
+ (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
@@ -300,7 +307,8 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mull,
- vector_unmerge_lowering, insertelt_nonconst]> {
+ vector_unmerge_lowering, insertelt_nonconst,
+ lower_build_insert_vec_elt]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768a..b849e4c50e4fce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3302,6 +3302,10 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
+ [(vector_insert undef, node:$src, (i64 0)),
+ (scalar_to_vector node:$src)]>;
+
// For regular load, we do not have any alignment requirement.
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
@@ -3310,13 +3314,13 @@ multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
ValueType ScalTy, ValueType VecTy,
Instruction LOADW, Instruction LOADX,
SubRegIndex sub> {
- def : Pat<(VecTy (scalar_to_vector (ScalTy
+ def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
(loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
sub)>;
- def : Pat<(VecTy (scalar_to_vector (ScalTy
+ def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
(loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
@@ -3344,12 +3348,12 @@ defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))))),
(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
@@ -3482,34 +3486,34 @@ def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32
+def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32
+def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32
+def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32
+def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32
+def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32
+def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (scalar_to_vector (i64
+def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
@@ -6824,10 +6828,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
defm INS : SIMDIns;
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
@@ -6835,50 +6839,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))),
(SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>;
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
@@ -8550,7 +8553,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
let Predicates = [HasNEON] in {
class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
SDPatternOperator ExtLoad, Instruction LD1>
- : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+ : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))),
(ResultTy (EXTRACT_SUBREG
(LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
@@ -8983,11 +8986,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)),
(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)),
(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
(COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e9e6b6cb68d0d1..18361cf3685642 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2116,6 +2116,21 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
I.getOperand(1).setReg(NewSrc.getReg(0));
return true;
}
+ case AArch64::G_INSERT_VECTOR_ELT: {
+ // Convert the type from p0 to s64 to help selection.
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg());
+ if (!SrcVecTy.isPointerVector())
+ return false;
+ auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg());
+ MRI.setType(I.getOperand(1).getReg(),
+ DstTy.changeElementType(LLT::scalar(64)));
+ MRI.setType(I.getOperand(0).getReg(),
+ DstTy.changeElementType(LLT::scalar(64)));
+ MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
+ I.getOperand(2).setReg(NewSrc.getReg(0));
+ return true;
+ }
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_SITOFP: {
// If both source and destination regbanks are FPR, then convert the opcode
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4a1977ba1a00f0..d90fbaff38ce50 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1048,6 +1048,45 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
+// Intended to match the last part of
+// AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG)
+bool matchLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+ "Expected G_BUILD_VECTOR instruction");
+ bool isConstant = true;
+
+ // Check whether every source operand is a constant
+ for (unsigned i = 1; i < MI.getNumOperands(); i++) {
+ auto ConstVal =
+ getAnyConstantVRegValWithLookThrough(MI.getOperand(i).getReg(), MRI);
+ if (!ConstVal.has_value()) {
+ isConstant = false;
+ }
+ }
+
+ if (isConstant)
+ return false;
+
+ return true;
+}
+
+void applyLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register DstReg = B.buildUndef(DstTy).getReg(0);
+
+ for (unsigned i = 1; i < MI.getNumOperands(); i++) {
+ Register SrcReg = MI.getOperand(i).getReg();
+ if (MRI.getVRegDef(SrcReg)->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ continue;
+ Register IdxReg = B.buildConstant(LLT::scalar(64), i - 1).getReg(0);
+ DstReg =
+ B.buildInsertVectorElement(DstTy, DstReg, SrcReg, IdxReg).getReg(0);
+ }
+ B.buildCopy(MI.getOperand(0).getReg(), DstReg);
+ MI.eraseFromParent();
+}
+
bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
Register &SrcReg) {
assert(MI.getOpcode() == TargetOpcode::G_STORE);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
index f7efaeaa507053..87c1307ad29556 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
@@ -10,12 +10,14 @@ define i32 @bar() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: mov b1, v0[1]
-; CHECK-NEXT: mov b2, v0[2]
-; CHECK-NEXT: mov b3, v0[3]
-; CHECK-NEXT: mov.h v0[1], v1[0]
-; CHECK-NEXT: mov.h v2[1], v3[0]
+; CHECK-NEXT: mov b2, v0[3]
+; CHECK-NEXT: mov b3, v0[2]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: mov.h v0[1], w8
+; CHECK-NEXT: mov.h v3[1], w9
; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll.4s v1, v2, #0
+; CHECK-NEXT: ushll.4s v1, v3, #0
; CHECK-NEXT: mov.d v0[1], v1[0]
; CHECK-NEXT: movi.4s v1, #1
; CHECK-NEXT: and.16b v0, v0, v1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
index 70867c2ea2842a..0115531dfb09ae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
@@ -42,20 +42,30 @@ body: |
; LOWER-NEXT: {{ $}}
; LOWER-NEXT: %r:_(s32) = COPY $w0
; LOWER-NEXT: %q:_(s32) = COPY $w1
- ; LOWER-NEXT: %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r(s32), %q(s32)
+ ; LOWER-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+ ; LOWER-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; LOWER-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %r(s32), [[C]](s64)
+ ; LOWER-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; LOWER-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %q(s32), [[C1]](s64)
+ ; LOWER-NEXT: %build_vector:_(<2 x s32>) = COPY [[IVEC1]](<2 x s32>)
; LOWER-NEXT: $d0 = COPY %build_vector(<2 x s32>)
; LOWER-NEXT: RET_ReallyLR implicit $d0
;
; SELECT-LABEL: name: dont_combine_different_reg
; SELECT: liveins: $d0, $w0, $w1
; SELECT-NEXT: {{ $}}
- ; SELECT-NEXT: %r:gpr32all = COPY $w0
+ ; SELECT-NEXT: %r:gpr32 = COPY $w0
; SELECT-NEXT: %q:gpr32 = COPY $w1
- ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub
- ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %q
- ; SELECT-NEXT: %build_vector:fpr64 = COPY [[INSvi32gpr]].dsub
- ; SELECT-NEXT: $d0 = COPY %build_vector
+ ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
+ ; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub
+ ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r
+ ; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
+ ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
+ ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
+ ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub
+ ; SELECT-NEXT: $d0 = COPY [[COPY1]]
; SELECT-NEXT: RET_ReallyLR implicit $d0
%r:_(s32) = COPY $w0
%q:_(s32) = COPY $w1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
index 9d12c3c32c7f8b..c5176805b9ebef 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
@@ -355,7 +355,21 @@ body: |
; CHECK: liveins: $w0, $w1, $w2, $w3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %lane:_(s32) = COPY $w0
- ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUP %lane(s32)
+ ; CHECK-NEXT: %b:_(s32) = COPY $w1
+ ; CHECK-NEXT: %c:_(s32) = COPY $w2
+ ; CHECK-NEXT: %d:_(s32) = COPY $w3
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane(s32), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64)
+ ; CHECK-NEXT: %buildvec:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUPLANE32 %buildvec, [[C4]](s64)
; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%lane:_(s32) = COPY $w0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index f47da47002fbcd..9734ab35bd6b2d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -76,7 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: bic w9, w9, w8
; CHECK-GI-NEXT: and w8, w8, w10
; CHECK-GI-NEXT: orr w8, w9, w8
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%neg = xor <1 x i32> %C, <i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 6431cfc58a54d2..45ad4b07ff66f7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -76,7 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: and w9, w8, w9
; CHECK-GI-NEXT: bic w8, w10, w8
; CHECK-GI-NEXT: orr w8, w9, w8
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%and = and <1 x i32> %C, %B
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 307aa397eabbbe..d677526bab0005 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -209,24 +209,22 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: uxtb w8, w8
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b1, v0.b[2]
+; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov b3, v0.b[3]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: fmov w11, s3
+; CHECK-GI-NEXT: ldr d2, [x1]
; CHECK-GI-NEXT: uxtb w9, w9
; CHECK-GI-NEXT: uxtb w10, w10
; CHECK-GI-NEXT: uxtb w11, w11
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: fmov s2, w10
-; CHECK-GI-NEXT: fmov s3, w11
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w10
+; CHECK-GI-NEXT: mov v1.h[1], w11
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT: ldr d2, [x1]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
@@ -269,25 +267,25 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
;
; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
+; CHECK-GI-NEXT: ld1 { v1.h }[0], [x0]
; CHECK-GI-NEXT: ldr h2, [x0, #2]
; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: ldr d0, [x1]
-; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov d1, x8
-; CHECK-GI-NEXT: fmov x11, d0
-; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: mov x9, v0.d[1]
-; CHECK-GI-NEXT: fmov x10, d1
-; CHECK-GI-NEXT: mov x8, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
%load.A = load <2 x i16>, ptr %A
%load.B = load <2 x i32>, ptr %B
@@ -322,14 +320,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
%load.A = load <2 x i32>, ptr %A
%and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
@@ -1048,14 +1046,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI36_0
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
%tmp3 = sext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
@@ -1163,14 +1161,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI40_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
@@ -1264,15 +1262,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI43_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
@@ -1891,15 +1889,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mul x9, x10, x11
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
%in1 = zext <2 x i32> %src1 to <2 x i64>
@@ -1947,10 +1945,10 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
; CHECK-GI-NEXT: fmov x9, d0
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x9, x9, x12
-; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mul x11, x13, x14
+; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: mov v0.d[1], x10
-; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: mov v1.d[1], x11
; CHECK-GI-NEXT: ret
entry:
@@ -1992,9 +1990,9 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x12, x9
; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mul x11, x13, x11
-; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: mov v0.d[1], x10
; CHECK-GI-NEXT: mov v1.d[1], x11
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 78c1ff7b993706..6da019a79b7277 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -247,7 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: cmp w8, #0
; CHECK-GI-NEXT: cneg w8, w9, le
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -299,10 +299,8 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
; CHECK-GI-LABEL: abs_v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], w1
+; CHECK-GI-NEXT: mov v0.b[2], w2
; CHECK-GI-NEXT: abs v0.8b, v0.8b
; CHECK-GI-NEXT: umov w0, v0.b[0]
; CHECK-GI-NEXT: umov w1, v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 0291f8c9123047..a25763e3b15907 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -334,25 +334,40 @@ entry:
}
define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
-; CHECK-LABEL: f:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov.s v0[1], w1
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: f:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: mov.s v0[1], w1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: f:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov.s v0[0], w0
+; CHECK-GI-NEXT: mov.s v0[1], w1
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
%vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
%vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
ret <2 x i32> %vecinit1
}
define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
-; CHECK-LABEL: g:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov.s v0[1], w1
-; CHECK-NEXT: mov.s v0[2], w1
-; CHECK-NEXT: mov.s v0[3], w0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: g:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: mov.s v0[1], w1
+; CHECK-SD-NEXT: mov.s v0[2], w1
+; CHECK-SD-NEXT: mov.s v0[3], w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov.s v0[0], w0
+; CHECK-GI-NEXT: mov.s v0[1], w1
+; CHECK-GI-NEXT: mov.s v0[2], w1
+; CHECK-GI-NEXT: mov.s v0[3], w0
+; CHECK-GI-NEXT: ret
%vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
%vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
@@ -361,11 +376,17 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
}
define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
-; CHECK-LABEL: h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: mov.d v0[1], x1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, x0
+; CHECK-SD-NEXT: mov.d v0[1], x1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov.d v0[0], x0
+; CHECK-GI-NEXT: mov.d v0[1], x1
+; CHECK-GI-NEXT: ret
%vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
%vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
ret <2 x i64> %vecinit1
@@ -386,8 +407,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
;
; CHECK-GI-LABEL: test_build_illegal:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s0, v0[3]
-; CHECK-GI-NEXT: mov.h v0[3], v0[0]
+; CHECK-GI-NEXT: mov.s w8, v0[3]
+; CHECK-GI-NEXT: mov.h v0[3], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%val = extractelement <4 x i32> %in, i32 3
diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index bc399c8d4ff071..8611532d6ea924 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -29,19 +29,20 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
; CHECK-GISEL-NEXT: mov w9, w0
; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT: mov b1, v0.b[1]
; CHECK-GISEL-NEXT: add x8, sp, #8
-; CHECK-GISEL-NEXT: and x9, x9, #0x7
; CHECK-GISEL-NEXT: str d0, [sp, #8]
+; CHECK-GISEL-NEXT: and x9, x9, #0x7
+; CHECK-GISEL-NEXT: mov b2, v0.b[1]
; CHECK-GISEL-NEXT: mov b3, v0.b[2]
; CHECK-GISEL-NEXT: lsl x10, x9, #1
; CHECK-GISEL-NEXT: mov b0, v0.b[3]
; CHECK-GISEL-NEXT: sub x9, x10, x9
-; CHECK-GISEL-NEXT: ldr b2, [x8, x9]
-; CHECK-GISEL-NEXT: mov v2.b[1], v1.b[0]
-; CHECK-GISEL-NEXT: mov v2.b[2], v3.b[0]
-; CHECK-GISEL-NEXT: mov v2.b[3], v0.b[0]
-; CHECK-GISEL-NEXT: ushll v0.8h, v2.8b, #0
+; CHECK-GISEL-NEXT: ldr b1, [x8, x9]
+; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0]
+; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GISEL-NEXT: mov v1.b[3], v0.b[0]
+; CHECK-GISEL-NEXT: ushll v0.8h, v1.8b, #0
; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GISEL-NEXT: add sp, sp, #16
; CHECK-GISEL-NEXT: ret
@@ -82,14 +83,15 @@ define <8 x i8> @test_varidx_extract_v16s8(<16 x i8> %x, i32 %idx) {
; CHECK-GISEL-NEXT: sub sp, sp, #16
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
; CHECK-GISEL-NEXT: mov w9, w0
-; CHECK-GISEL-NEXT: mov b2, v0.b[1]
; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0xf
; CHECK-GISEL-NEXT: str q0, [sp]
+; CHECK-GISEL-NEXT: and x9, x9, #0xf
+; CHECK-GISEL-NEXT: mov b2, v0.b[1]
; CHECK-GISEL-NEXT: mov b3, v0.b[2]
; CHECK-GISEL-NEXT: lsl x10, x9, #1
; CHECK-GISEL-NEXT: sub x9, x10, x9
; CHECK-GISEL-NEXT: ldr b1, [x8, x9]
+; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0]
; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0]
; CHECK-GISEL-NEXT: mov b2, v0.b[3]
; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0]
@@ -176,15 +178,14 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
; CHECK-GISEL: // %bb.0:
; CHECK-GISEL-NEXT: sub sp, sp, #16
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GISEL-NEXT: mov w9, w0
-; CHECK-GISEL-NEXT: mov h1, v0.h[1]
+; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GISEL-NEXT: add x8, sp, #8
; CHECK-GISEL-NEXT: str d0, [sp, #8]
; CHECK-GISEL-NEXT: and x9, x9, #0x3
-; CHECK-GISEL-NEXT: ldr h0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GISEL-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1]
+; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GISEL-NEXT: ushll v0.4s, v1.4h, #0
; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GISEL-NEXT: add sp, sp, #16
; CHECK-GISEL-NEXT: ret
@@ -217,16 +218,13 @@ define <4 x i16> @test_varidx_extract_v8s16(<8 x i16> %x, i32 %idx) {
; CHECK-GISEL-NEXT: sub sp, sp, #16
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
; CHECK-GISEL-NEXT: mov w9, w0
-; CHECK-GISEL-NEXT: mov h2, v0.h[1]
; CHECK-GISEL-NEXT: mov x8, sp
; CHECK-GISEL-NEXT: str q0, [sp]
; CHECK-GISEL-NEXT: and x9, x9, #0x7
-; CHECK-GISEL-NEXT: mov h3, v0.h[2]
; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT: mov h0, v0.h[3]
-; CHECK-GISEL-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GISEL-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[0]
+; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GISEL-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GISEL-NEXT: fmov d0, d1
; CHECK-GISEL-NEXT: add sp, sp, #16
; CHECK-GISEL-NEXT: ret
@@ -289,13 +287,12 @@ define <2 x i32> @test_varidx_extract_v4s32(<4 x i32> %x, i32 %idx) {
; CHECK-GISEL-NEXT: sub sp, sp, #16
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
; CHECK-GISEL-NEXT: mov w9, w0
-; CHECK-GISEL-NEXT: mov s1, v0.s[1]
; CHECK-GISEL-NEXT: mov x8, sp
; CHECK-GISEL-NEXT: str q0, [sp]
; CHECK-GISEL-NEXT: and x9, x9, #0x3
-; CHECK-GISEL-NEXT: ldr s0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GISEL-NEXT: ldr s1, [x8, x9, lsl #2]
+; CHECK-GISEL-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GISEL-NEXT: fmov d0, d1
; CHECK-GISEL-NEXT: add sp, sp, #16
; CHECK-GISEL-NEXT: ret
%tmp = extractelement <4 x i32> %x, i32 %idx
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 720951eca6a344..0412aef7545e9d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -13820,12 +13820,10 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
; CHECK-GI-LABEL: test_ld1lane_build:
; CHECK-GI: ; %bb.0:
; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: ldr s1, [x1]
-; CHECK-GI-NEXT: ldr s2, [x2]
-; CHECK-GI-NEXT: ldr s3, [x3]
-; CHECK-GI-NEXT: mov.s v0[1], v1[0]
-; CHECK-GI-NEXT: mov.s v2[1], v3[0]
-; CHECK-GI-NEXT: sub.2s v0, v0, v2
+; CHECK-GI-NEXT: ldr s1, [x2]
+; CHECK-GI-NEXT: ld1.s { v0 }[1], [x1]
+; CHECK-GI-NEXT: ld1.s { v1 }[1], [x3]
+; CHECK-GI-NEXT: sub.2s v0, v0, v1
; CHECK-GI-NEXT: str d0, [x4]
; CHECK-GI-NEXT: ret
%load0 = load i32, ptr %ptr0, align 4
@@ -13844,28 +13842,15 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
}
define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %e, ptr %p) {
-; CHECK-SD-LABEL: test_ld1lane_build_i16:
-; CHECK-SD: ; %bb.0:
-; CHECK-SD-NEXT: ldr h1, [x0]
-; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1]
-; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2]
-; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3]
-; CHECK-SD-NEXT: sub.4h v0, v1, v0
-; CHECK-SD-NEXT: str d0, [x4]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_ld1lane_build_i16:
-; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: mov.h v1[1], v2[0]
-; CHECK-GI-NEXT: ldr h2, [x2]
-; CHECK-GI-NEXT: mov.h v1[2], v2[0]
-; CHECK-GI-NEXT: ldr h2, [x3]
-; CHECK-GI-NEXT: mov.h v1[3], v2[0]
-; CHECK-GI-NEXT: sub.4h v0, v1, v0
-; CHECK-GI-NEXT: str d0, [x4]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_ld1lane_build_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: ld1.h { v1 }[1], [x1]
+; CHECK-NEXT: ld1.h { v1 }[2], [x2]
+; CHECK-NEXT: ld1.h { v1 }[3], [x3]
+; CHECK-NEXT: sub.4h v0, v1, v0
+; CHECK-NEXT: str d0, [x4]
+; CHECK-NEXT: ret
%ld.a = load i16, ptr %a
%ld.b = load i16, ptr %b
%ld.c = load i16, ptr %c
@@ -13880,34 +13865,18 @@ define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %
}
define void @test_ld1lane_build_half(ptr %a, ptr %b, ptr %c, ptr %d, <4 x half> %e, ptr %p) {
-; CHECK-SD-LABEL: test_ld1lane_build_half:
-; CHECK-SD: ; %bb.0:
-; CHECK-SD-NEXT: ldr h1, [x0]
-; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1]
-; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2]
-; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3]
-; CHECK-SD-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-SD-NEXT: fsub.4s v0, v1, v0
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: str d0, [x4]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_ld1lane_build_half:
-; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT: mov.h v1[1], v2[0]
-; CHECK-GI-NEXT: ldr h2, [x2]
-; CHECK-GI-NEXT: mov.h v1[2], v2[0]
-; CHECK-GI-NEXT: ldr h2, [x3]
-; CHECK-GI-NEXT: mov.h v1[3], v2[0]
-; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT: fsub.4s v0, v1, v0
-; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NEXT: str d0, [x4]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_ld1lane_build_half:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: ld1.h { v1 }[1], [x1]
+; CHECK-NEXT: ld1.h { v1 }[2], [x2]
+; CHECK-NEXT: ld1.h { v1 }[3], [x3]
+; CHECK-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-NEXT: fsub.4s v0, v1, v0
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: str d0, [x4]
+; CHECK-NEXT: ret
%ld.a = load half, ptr %a
%ld.b = load half, ptr %b
%ld.c = load half, ptr %c
@@ -13942,6 +13911,7 @@ define void @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr
; CHECK-GI-NEXT: ldr b1, [x0]
; CHECK-GI-NEXT: ldr b2, [x1]
; CHECK-GI-NEXT: ldr x8, [sp]
+; CHECK-GI-NEXT: mov.b v1[0], v1[0]
; CHECK-GI-NEXT: mov.b v1[1], v2[0]
; CHECK-GI-NEXT: ldr b2, [x2]
; CHECK-GI-NEXT: mov.b v1[2], v2[0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index c56f4409e3a627..c0d91c1e0c836b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1259,7 +1259,7 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
;
; CHECK-GI-LABEL: scalar_to_vector.v2i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%b = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -1267,19 +1267,29 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
}
define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-LABEL: scalar_to_vector.v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: scalar_to_vector.v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: scalar_to_vector.v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: ret
%b = insertelement <4 x i32> undef, i32 %a, i32 0
ret <4 x i32> %b
}
define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-LABEL: scalar_to_vector.v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: scalar_to_vector.v2i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, x0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: scalar_to_vector.v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: ret
%b = insertelement <2 x i64> undef, i64 %a, i32 0
ret <2 x i64> %b
}
@@ -1348,21 +1358,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
;
; CHECK-GI-LABEL: getl:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[5]
-; CHECK-GI-NEXT: mov b6, v0.b[6]
-; CHECK-GI-NEXT: mov b7, v0.b[7]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[5], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[6], v6.b[0]
-; CHECK-GI-NEXT: mov v0.b[7], v7.b[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[3]
+; CHECK-GI-NEXT: mov v1.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[4]
+; CHECK-GI-NEXT: mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[5]
+; CHECK-GI-NEXT: mov v1.b[4], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[6]
+; CHECK-GI-NEXT: mov b0, v0.b[7]
+; CHECK-GI-NEXT: mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT: mov v1.b[6], v3.b[0]
+; CHECK-GI-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT: fmov d0, d1
; CHECK-GI-NEXT: ret
%vecext = extractelement <16 x i8> %x, i32 0
%vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
@@ -1405,16 +1416,13 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: mov w9, w0
-; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov x8, sp
; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0x7
-; CHECK-GI-NEXT: mov h3, v0.h[2]
; CHECK-GI-NEXT: ldr h1, [x8, x9, lsl #1]
-; CHECK-GI-NEXT: mov h0, v0.h[3]
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
; CHECK-GI-NEXT: fmov d0, d1
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
@@ -1709,8 +1717,8 @@ define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
;
; CHECK-GI-LABEL: test_concat_undef_v1i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v0.s[1], v0.s[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -1794,25 +1802,26 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: mov v2.16b, v1.16b
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov b3, v0.b[1]
; CHECK-GI-NEXT: adrp x8, .LCPI127_0
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
-; CHECK-GI-NEXT: mov b4, v0.b[3]
-; CHECK-GI-NEXT: mov b5, v0.b[4]
-; CHECK-GI-NEXT: mov b6, v0.b[5]
-; CHECK-GI-NEXT: mov b7, v0.b[6]
-; CHECK-GI-NEXT: mov b16, v0.b[7]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0]
-; CHECK-GI-NEXT: mov v0.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT: mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT: mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov v1.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov v1.b[3], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[5]
+; CHECK-GI-NEXT: mov v1.b[4], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[6]
+; CHECK-GI-NEXT: mov b0, v0.b[7]
+; CHECK-GI-NEXT: mov v1.b[5], v3.b[0]
+; CHECK-GI-NEXT: mov v1.b[6], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI127_0]
+; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <8 x i8> %x, i32 0
@@ -1844,36 +1853,38 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b4, v0.b[3]
-; CHECK-GI-NEXT: mov b5, v0.b[4]
-; CHECK-GI-NEXT: mov b6, v0.b[5]
-; CHECK-GI-NEXT: mov b7, v0.b[6]
-; CHECK-GI-NEXT: mov b16, v0.b[7]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[1]
-; CHECK-GI-NEXT: mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[5]
+; CHECK-GI-NEXT: mov v2.b[4], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[6]
+; CHECK-GI-NEXT: mov b0, v0.b[7]
+; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[2]
-; CHECK-GI-NEXT: mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT: mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT: mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT: mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[9], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[6], v4.b[0]
+; CHECK-GI-NEXT: mov v2.b[7], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[1]
+; CHECK-GI-NEXT: mov v2.b[8], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[9], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[3]
+; CHECK-GI-NEXT: mov v2.b[10], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[4]
-; CHECK-GI-NEXT: mov v0.b[11], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[5]
-; CHECK-GI-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[11], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[5]
+; CHECK-GI-NEXT: mov v2.b[12], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[6]
-; CHECK-GI-NEXT: mov b1, v1.b[7]
-; CHECK-GI-NEXT: mov v0.b[13], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[15], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[13], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[7]
+; CHECK-GI-NEXT: mov v2.b[14], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[15], v0.b[0]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <16 x i8> %x, i32 0
@@ -1922,36 +1933,38 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b3, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b4, v0.b[3]
-; CHECK-GI-NEXT: mov b5, v0.b[4]
-; CHECK-GI-NEXT: mov b6, v0.b[5]
-; CHECK-GI-NEXT: mov b7, v0.b[6]
-; CHECK-GI-NEXT: mov b16, v0.b[7]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[1]
-; CHECK-GI-NEXT: mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[4]
+; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[5]
+; CHECK-GI-NEXT: mov v2.b[4], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[6]
+; CHECK-GI-NEXT: mov b0, v0.b[7]
+; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[2]
-; CHECK-GI-NEXT: mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT: mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT: mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT: mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[9], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[6], v4.b[0]
+; CHECK-GI-NEXT: mov v2.b[7], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[1]
+; CHECK-GI-NEXT: mov v2.b[8], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[9], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[3]
+; CHECK-GI-NEXT: mov v2.b[10], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[4]
-; CHECK-GI-NEXT: mov v0.b[11], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[5]
-; CHECK-GI-NEXT: mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[11], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[5]
+; CHECK-GI-NEXT: mov v2.b[12], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[6]
-; CHECK-GI-NEXT: mov b1, v1.b[7]
-; CHECK-GI-NEXT: mov v0.b[13], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[15], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[13], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v1.b[7]
+; CHECK-GI-NEXT: mov v2.b[14], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[15], v0.b[0]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <8 x i8> %x, i32 0
@@ -2017,17 +2030,15 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: mov v2.16b, v1.16b
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI131_0
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0]
-; CHECK-GI-NEXT: mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI131_0]
+; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <4 x i16> %x, i32 0
@@ -2051,20 +2062,16 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov h3, v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT: mov h1, v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[5], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[6], v1.h[2]
+; CHECK-GI-NEXT: mov v2.h[7], v1.h[3]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <8 x i16> %x, i32 0
@@ -2097,20 +2104,16 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov h3, v1.h[2]
-; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT: mov h1, v1.h[3]
-; CHECK-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v2.h[4], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[5], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[6], v1.h[2]
+; CHECK-GI-NEXT: mov v2.h[7], v1.h[3]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <4 x i16> %x, i32 0
@@ -2160,13 +2163,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: mov v2.16b, v1.16b
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI135_0
-; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0]
+; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <2 x i32> %x, i32 0
@@ -2186,12 +2189,12 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov s2, v1.s[1]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[3], v1.s[1]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <4 x i32> %x, i32 0
@@ -2241,11 +2244,18 @@ entry:
}
define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
-; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <2 x i64> %x, i32 0
%vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index abf2e1272d6450..51ce5360744eb0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -466,62 +466,62 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
;
; CHECK-GI-LABEL: sext_v32i1:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s17, w0
-; CHECK-GI-NEXT: fmov s19, w4
+; CHECK-GI-NEXT: mov.s v3[0], w0
+; CHECK-GI-NEXT: mov.s v5[0], w4
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s21, [sp, #8]
+; CHECK-GI-NEXT: ldr s20, [sp, #8]
; CHECK-GI-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-NEXT: ldr s22, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
-; CHECK-GI-NEXT: ldr s23, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
-; CHECK-GI-NEXT: ldr s24, [sp, #104]
-; CHECK-GI-NEXT: mov.s v17[1], w1
-; CHECK-GI-NEXT: mov.s v19[1], w5
-; CHECK-GI-NEXT: ldr s5, [sp, #128]
-; CHECK-GI-NEXT: ldr s20, [sp, #136]
-; CHECK-GI-NEXT: mov.s v0[1], v21[0]
-; CHECK-GI-NEXT: ldr s7, [sp, #160]
+; CHECK-GI-NEXT: ldr s21, [sp, #40]
+; CHECK-GI-NEXT: ldr s6, [sp, #64]
+; CHECK-GI-NEXT: ldr s22, [sp, #72]
+; CHECK-GI-NEXT: ldr s7, [sp, #96]
+; CHECK-GI-NEXT: ldr s23, [sp, #104]
+; CHECK-GI-NEXT: mov.s v0[1], v20[0]
+; CHECK-GI-NEXT: mov.s v1[1], v21[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #128]
+; CHECK-GI-NEXT: ldr s24, [sp, #136]
+; CHECK-GI-NEXT: mov.s v3[1], w1
+; CHECK-GI-NEXT: ldr s17, [sp, #160]
; CHECK-GI-NEXT: ldr s25, [sp, #168]
-; CHECK-GI-NEXT: mov.s v1[1], v22[0]
-; CHECK-GI-NEXT: mov.s v2[1], v23[0]
-; CHECK-GI-NEXT: mov.s v3[1], v24[0]
-; CHECK-GI-NEXT: mov.s v5[1], v20[0]
-; CHECK-GI-NEXT: mov.s v7[1], v25[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #16]
-; CHECK-GI-NEXT: ldr s18, [sp, #48]
+; CHECK-GI-NEXT: mov.s v5[1], w5
+; CHECK-GI-NEXT: mov.s v6[1], v22[0]
+; CHECK-GI-NEXT: mov.s v7[1], v23[0]
+; CHECK-GI-NEXT: mov.s v16[1], v24[0]
+; CHECK-GI-NEXT: mov.s v17[1], v25[0]
+; CHECK-GI-NEXT: ldr s4, [sp, #16]
+; CHECK-GI-NEXT: ldr s19, [sp, #48]
; CHECK-GI-NEXT: ldr s20, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #112]
; CHECK-GI-NEXT: ldr s22, [sp, #144]
; CHECK-GI-NEXT: ldr s23, [sp, #176]
-; CHECK-GI-NEXT: mov.s v17[2], w2
-; CHECK-GI-NEXT: mov.s v19[2], w6
-; CHECK-GI-NEXT: mov.s v0[2], v16[0]
-; CHECK-GI-NEXT: mov.s v1[2], v18[0]
-; CHECK-GI-NEXT: mov.s v2[2], v20[0]
-; CHECK-GI-NEXT: mov.s v3[2], v21[0]
-; CHECK-GI-NEXT: mov.s v5[2], v22[0]
-; CHECK-GI-NEXT: mov.s v7[2], v23[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #24]
-; CHECK-GI-NEXT: ldr s6, [sp, #56]
-; CHECK-GI-NEXT: ldr s16, [sp, #88]
-; CHECK-GI-NEXT: ldr s18, [sp, #120]
+; CHECK-GI-NEXT: mov.s v3[2], w2
+; CHECK-GI-NEXT: mov.s v5[2], w6
+; CHECK-GI-NEXT: mov.s v0[2], v4[0]
+; CHECK-GI-NEXT: mov.s v1[2], v19[0]
+; CHECK-GI-NEXT: mov.s v6[2], v20[0]
+; CHECK-GI-NEXT: mov.s v7[2], v21[0]
+; CHECK-GI-NEXT: mov.s v16[2], v22[0]
+; CHECK-GI-NEXT: mov.s v17[2], v23[0]
+; CHECK-GI-NEXT: ldr s2, [sp, #24]
+; CHECK-GI-NEXT: ldr s18, [sp, #56]
+; CHECK-GI-NEXT: ldr s4, [sp, #88]
+; CHECK-GI-NEXT: ldr s19, [sp, #120]
; CHECK-GI-NEXT: ldr s20, [sp, #152]
; CHECK-GI-NEXT: ldr s21, [sp, #184]
-; CHECK-GI-NEXT: mov.s v17[3], w3
-; CHECK-GI-NEXT: mov.s v19[3], w7
-; CHECK-GI-NEXT: mov.s v0[3], v4[0]
-; CHECK-GI-NEXT: mov.s v1[3], v6[0]
-; CHECK-GI-NEXT: mov.s v2[3], v16[0]
-; CHECK-GI-NEXT: mov.s v3[3], v18[0]
-; CHECK-GI-NEXT: mov.s v5[3], v20[0]
-; CHECK-GI-NEXT: mov.s v7[3], v21[0]
-; CHECK-GI-NEXT: uzp1.8h v4, v17, v19
+; CHECK-GI-NEXT: mov.s v3[3], w3
+; CHECK-GI-NEXT: mov.s v5[3], w7
+; CHECK-GI-NEXT: mov.s v0[3], v2[0]
+; CHECK-GI-NEXT: mov.s v1[3], v18[0]
+; CHECK-GI-NEXT: mov.s v6[3], v4[0]
+; CHECK-GI-NEXT: mov.s v7[3], v19[0]
+; CHECK-GI-NEXT: mov.s v16[3], v20[0]
+; CHECK-GI-NEXT: mov.s v17[3], v21[0]
+; CHECK-GI-NEXT: uzp1.8h v2, v3, v5
; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
-; CHECK-GI-NEXT: uzp1.8h v2, v5, v7
-; CHECK-GI-NEXT: uzp1.16b v0, v4, v0
-; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
+; CHECK-GI-NEXT: uzp1.8h v1, v6, v7
+; CHECK-GI-NEXT: uzp1.8h v3, v16, v17
+; CHECK-GI-NEXT: uzp1.16b v0, v2, v0
+; CHECK-GI-NEXT: uzp1.16b v1, v1, v3
; CHECK-GI-NEXT: shl.16b v0, v0, #7
; CHECK-GI-NEXT: shl.16b v1, v1, #7
; CHECK-GI-NEXT: sshr.16b v0, v0, #7
@@ -820,114 +820,114 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
; CHECK-GI-NEXT: ldr s1, [sp, #64]
; CHECK-GI-NEXT: ldr s23, [sp, #72]
; CHECK-GI-NEXT: mov.s v0[1], v4[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #200]
-; CHECK-GI-NEXT: ldr s3, [sp, #128]
+; CHECK-GI-NEXT: ldr s4, [sp, #160]
+; CHECK-GI-NEXT: ldr s25, [sp, #168]
; CHECK-GI-NEXT: mov.s v2[1], v5[0]
; CHECK-GI-NEXT: mov.s v1[1], v23[0]
; CHECK-GI-NEXT: ldr s5, [sp, #192]
+; CHECK-GI-NEXT: ldr s28, [sp, #200]
+; CHECK-GI-NEXT: mov.s v4[1], v25[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #128]
; CHECK-GI-NEXT: ldr s7, [sp, #136]
-; CHECK-GI-NEXT: ldr s4, [sp, #160]
-; CHECK-GI-NEXT: ldr s24, [sp, #168]
-; CHECK-GI-NEXT: mov.s v5[1], v28[0]
; CHECK-GI-NEXT: ldr s6, [sp, #48]
; CHECK-GI-NEXT: ldr s21, [sp, #80]
-; CHECK-GI-NEXT: mov.s v3[1], v7[0]
-; CHECK-GI-NEXT: mov.s v4[1], v24[0]
+; CHECK-GI-NEXT: mov.s v5[1], v28[0]
; CHECK-GI-NEXT: ldr s16, [sp, #112]
-; CHECK-GI-NEXT: ldr s29, [sp, #208]
+; CHECK-GI-NEXT: ldr s27, [sp, #176]
+; CHECK-GI-NEXT: mov.s v3[1], v7[0]
; CHECK-GI-NEXT: mov.s v0[2], v6[0]
; CHECK-GI-NEXT: mov.s v1[2], v21[0]
+; CHECK-GI-NEXT: ldr s29, [sp, #208]
+; CHECK-GI-NEXT: ldr s20, [sp, #144]
+; CHECK-GI-NEXT: mov.s v2[2], v16[0]
; CHECK-GI-NEXT: ldr s6, [sp, #224]
; CHECK-GI-NEXT: ldr s30, [sp, #232]
-; CHECK-GI-NEXT: mov.s v2[2], v16[0]
-; CHECK-GI-NEXT: ldr s20, [sp, #144]
-; CHECK-GI-NEXT: ldr s27, [sp, #176]
+; CHECK-GI-NEXT: mov.s v4[2], v27[0]
+; CHECK-GI-NEXT: ldr s7, [sp, #256]
+; CHECK-GI-NEXT: ldr s31, [sp, #264]
; CHECK-GI-NEXT: mov.s v5[2], v29[0]
-; CHECK-GI-NEXT: mov.s v6[1], v30[0]
; CHECK-GI-NEXT: ldr s18, [sp, #88]
; CHECK-GI-NEXT: ldr s19, [sp, #120]
-; CHECK-GI-NEXT: ldr s7, [sp, #256]
-; CHECK-GI-NEXT: ldr s31, [sp, #264]
; CHECK-GI-NEXT: mov.s v3[2], v20[0]
-; CHECK-GI-NEXT: mov.s v4[2], v27[0]
-; CHECK-GI-NEXT: ldr s25, [sp, #216]
-; CHECK-GI-NEXT: ldr s26, [sp, #240]
+; CHECK-GI-NEXT: ldr s23, [sp, #184]
+; CHECK-GI-NEXT: ldr s24, [sp, #216]
+; CHECK-GI-NEXT: mov.s v6[1], v30[0]
+; CHECK-GI-NEXT: mov.s v7[1], v31[0]
; CHECK-GI-NEXT: ldr s17, [sp, #56]
; CHECK-GI-NEXT: ldr s22, [sp, #152]
+; CHECK-GI-NEXT: ldr s26, [sp, #240]
+; CHECK-GI-NEXT: ldr s28, [sp, #272]
; CHECK-GI-NEXT: mov.s v1[3], v18[0]
-; CHECK-GI-NEXT: ldr s23, [sp, #184]
; CHECK-GI-NEXT: mov.s v2[3], v19[0]
; CHECK-GI-NEXT: ldr s18, [sp, #320]
; CHECK-GI-NEXT: ldr s27, [sp, #328]
-; CHECK-GI-NEXT: mov.s v7[1], v31[0]
; CHECK-GI-NEXT: ldr s19, [sp, #352]
; CHECK-GI-NEXT: ldr s29, [sp, #360]
-; CHECK-GI-NEXT: mov.s v5[3], v25[0]
-; CHECK-GI-NEXT: mov.s v6[2], v26[0]
-; CHECK-GI-NEXT: fmov s25, w0
-; CHECK-GI-NEXT: fmov s26, w4
-; CHECK-GI-NEXT: ldr s28, [sp, #272]
+; CHECK-GI-NEXT: mov.s v4[3], v23[0]
+; CHECK-GI-NEXT: mov.s v23[0], w0
+; CHECK-GI-NEXT: mov.s v5[3], v24[0]
+; CHECK-GI-NEXT: mov.s v24[0], w4
; CHECK-GI-NEXT: mov.s v0[3], v17[0]
; CHECK-GI-NEXT: ldr s17, [sp, #288]
; CHECK-GI-NEXT: ldr s8, [sp, #296]
; CHECK-GI-NEXT: mov.s v3[3], v22[0]
; CHECK-GI-NEXT: ldr s20, [sp, #384]
-; CHECK-GI-NEXT: mov.s v4[3], v23[0]
+; CHECK-GI-NEXT: mov.s v6[2], v26[0]
; CHECK-GI-NEXT: ldr s30, [sp, #392]
+; CHECK-GI-NEXT: mov.s v7[2], v28[0]
; CHECK-GI-NEXT: ldr s22, [sp, #416]
-; CHECK-GI-NEXT: ldr s31, [sp, #424]
-; CHECK-GI-NEXT: ldr s23, [sp, #448]
+; CHECK-GI-NEXT: ldr s28, [sp, #424]
; CHECK-GI-NEXT: mov.s v18[1], v27[0]
+; CHECK-GI-NEXT: ldr s26, [sp, #448]
; CHECK-GI-NEXT: mov.s v19[1], v29[0]
; CHECK-GI-NEXT: ldr s27, [sp, #456]
-; CHECK-GI-NEXT: ldr s24, [sp, #336]
+; CHECK-GI-NEXT: ldr s25, [sp, #336]
; CHECK-GI-NEXT: mov.s v17[1], v8[0]
-; CHECK-GI-NEXT: mov.s v7[2], v28[0]
-; CHECK-GI-NEXT: mov.s v25[1], w1
-; CHECK-GI-NEXT: mov.s v26[1], w5
+; CHECK-GI-NEXT: mov.s v23[1], w1
+; CHECK-GI-NEXT: mov.s v24[1], w5
; CHECK-GI-NEXT: mov.s v20[1], v30[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #368]
-; CHECK-GI-NEXT: mov.s v22[1], v31[0]
-; CHECK-GI-NEXT: mov.s v23[1], v27[0]
+; CHECK-GI-NEXT: ldr s29, [sp, #368]
+; CHECK-GI-NEXT: mov.s v22[1], v28[0]
+; CHECK-GI-NEXT: mov.s v26[1], v27[0]
; CHECK-GI-NEXT: ldr s9, [sp, #304]
; CHECK-GI-NEXT: ldr s27, [sp, #400]
-; CHECK-GI-NEXT: mov.s v18[2], v24[0]
-; CHECK-GI-NEXT: ldr s24, [sp, #432]
-; CHECK-GI-NEXT: mov.s v19[2], v28[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #464]
+; CHECK-GI-NEXT: mov.s v18[2], v25[0]
+; CHECK-GI-NEXT: ldr s25, [sp, #432]
+; CHECK-GI-NEXT: mov.s v19[2], v29[0]
+; CHECK-GI-NEXT: ldr s29, [sp, #464]
; CHECK-GI-NEXT: ldr s16, [sp, #248]
; CHECK-GI-NEXT: ldr s21, [sp, #280]
; CHECK-GI-NEXT: mov.s v17[2], v9[0]
-; CHECK-GI-NEXT: mov.s v25[2], w2
-; CHECK-GI-NEXT: mov.s v26[2], w6
+; CHECK-GI-NEXT: mov.s v23[2], w2
+; CHECK-GI-NEXT: mov.s v24[2], w6
; CHECK-GI-NEXT: mov.s v20[2], v27[0]
-; CHECK-GI-NEXT: mov.s v22[2], v24[0]
-; CHECK-GI-NEXT: mov.s v23[2], v28[0]
-; CHECK-GI-NEXT: ldr s29, [sp, #312]
+; CHECK-GI-NEXT: mov.s v22[2], v25[0]
+; CHECK-GI-NEXT: mov.s v26[2], v29[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #312]
; CHECK-GI-NEXT: ldr s27, [sp, #344]
-; CHECK-GI-NEXT: ldr s24, [sp, #376]
-; CHECK-GI-NEXT: ldr s28, [sp, #408]
+; CHECK-GI-NEXT: ldr s25, [sp, #376]
+; CHECK-GI-NEXT: ldr s29, [sp, #408]
; CHECK-GI-NEXT: mov.s v6[3], v16[0]
; CHECK-GI-NEXT: ldr s16, [sp, #440]
; CHECK-GI-NEXT: mov.s v7[3], v21[0]
; CHECK-GI-NEXT: ldr s21, [sp, #472]
-; CHECK-GI-NEXT: mov.s v25[3], w3
-; CHECK-GI-NEXT: mov.s v26[3], w7
-; CHECK-GI-NEXT: mov.s v17[3], v29[0]
+; CHECK-GI-NEXT: mov.s v23[3], w3
+; CHECK-GI-NEXT: mov.s v24[3], w7
+; CHECK-GI-NEXT: mov.s v17[3], v28[0]
; CHECK-GI-NEXT: mov.s v18[3], v27[0]
-; CHECK-GI-NEXT: mov.s v19[3], v24[0]
-; CHECK-GI-NEXT: mov.s v20[3], v28[0]
+; CHECK-GI-NEXT: mov.s v19[3], v25[0]
+; CHECK-GI-NEXT: mov.s v20[3], v29[0]
; CHECK-GI-NEXT: mov.s v22[3], v16[0]
-; CHECK-GI-NEXT: mov.s v23[3], v21[0]
+; CHECK-GI-NEXT: mov.s v26[3], v21[0]
; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
; CHECK-GI-NEXT: uzp1.8h v2, v4, v5
; CHECK-GI-NEXT: uzp1.8h v3, v6, v7
; CHECK-GI-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: uzp1.8h v16, v25, v26
+; CHECK-GI-NEXT: uzp1.8h v16, v23, v24
; CHECK-GI-NEXT: uzp1.8h v4, v17, v18
; CHECK-GI-NEXT: uzp1.8h v5, v19, v20
-; CHECK-GI-NEXT: uzp1.8h v6, v22, v23
+; CHECK-GI-NEXT: uzp1.8h v6, v22, v26
; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
; CHECK-GI-NEXT: uzp1.16b v0, v16, v0
; CHECK-GI-NEXT: uzp1.16b v2, v3, v4
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 44b92e6ccd088f..a854cb7fec9917 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -368,28 +368,26 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT: mov.16b v5, v4
-; CHECK-GI-NEXT: mov.b v5[1], v4[0]
-; CHECK-GI-NEXT: mov.b v5[2], v4[0]
-; CHECK-GI-NEXT: mov.b v5[3], v4[0]
-; CHECK-GI-NEXT: mov.b v5[4], v4[0]
-; CHECK-GI-NEXT: mov.b v5[5], v4[0]
-; CHECK-GI-NEXT: mov.b v5[6], v4[0]
-; CHECK-GI-NEXT: mov.b v5[7], v4[0]
-; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov.b v4[1], w0
+; CHECK-GI-NEXT: mov.b v4[2], w0
+; CHECK-GI-NEXT: mov.b v4[3], w0
+; CHECK-GI-NEXT: mov.b v4[4], w0
+; CHECK-GI-NEXT: mov.b v4[5], w0
+; CHECK-GI-NEXT: mov.b v4[6], w0
+; CHECK-GI-NEXT: mov.b v4[7], w0
+; CHECK-GI-NEXT: mov.b v4[8], w8
+; CHECK-GI-NEXT: mov.b v4[9], w8
+; CHECK-GI-NEXT: mov.b v4[10], w8
+; CHECK-GI-NEXT: mov.b v4[11], w8
+; CHECK-GI-NEXT: mov.b v4[12], w8
+; CHECK-GI-NEXT: mov.b v4[13], w8
+; CHECK-GI-NEXT: mov.b v4[14], w8
+; CHECK-GI-NEXT: mov.b v4[15], w8
; CHECK-GI-NEXT: adrp x8, .LCPI10_1
-; CHECK-GI-NEXT: mov.b v5[8], v4[0]
-; CHECK-GI-NEXT: mov.b v5[9], v4[0]
-; CHECK-GI-NEXT: mov.b v5[10], v4[0]
-; CHECK-GI-NEXT: mov.b v5[11], v4[0]
-; CHECK-GI-NEXT: mov.b v5[12], v4[0]
-; CHECK-GI-NEXT: mov.b v5[13], v4[0]
-; CHECK-GI-NEXT: mov.b v5[14], v4[0]
-; CHECK-GI-NEXT: mov.b v5[15], v4[0]
-; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI10_1]
+; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI10_1]
; CHECK-GI-NEXT: adrp x8, .LCPI10_0
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5
-; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0]
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ret
@@ -488,35 +486,32 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #1 // =0x1
-; CHECK-GI-NEXT: fmov s6, w0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: fmov s4, w8
-; CHECK-GI-NEXT: mov w8, #255 // =0xff
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT: mov.16b v5, v4
-; CHECK-GI-NEXT: mov.b v5[1], v4[0]
-; CHECK-GI-NEXT: mov.b v5[2], v4[0]
-; CHECK-GI-NEXT: mov.b v5[3], v4[0]
-; CHECK-GI-NEXT: mov.b v5[4], v4[0]
-; CHECK-GI-NEXT: mov.b v5[5], v4[0]
-; CHECK-GI-NEXT: mov.b v5[6], v4[0]
-; CHECK-GI-NEXT: mov.b v5[7], v4[0]
-; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov.b v4[1], w8
+; CHECK-GI-NEXT: mov.b v4[2], w8
+; CHECK-GI-NEXT: mov.b v4[3], w8
+; CHECK-GI-NEXT: mov.b v4[4], w8
+; CHECK-GI-NEXT: mov.b v4[5], w8
+; CHECK-GI-NEXT: mov.b v4[6], w8
+; CHECK-GI-NEXT: mov.b v4[7], w8
+; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: mov.b v4[8], w8
+; CHECK-GI-NEXT: mov.b v4[9], w8
+; CHECK-GI-NEXT: mov.b v4[10], w8
+; CHECK-GI-NEXT: mov.b v4[11], w8
+; CHECK-GI-NEXT: mov.b v4[12], w0
+; CHECK-GI-NEXT: mov.b v4[13], w0
+; CHECK-GI-NEXT: mov.b v4[14], w8
; CHECK-GI-NEXT: adrp x8, .LCPI11_1
-; CHECK-GI-NEXT: mov.b v5[8], v4[0]
-; CHECK-GI-NEXT: mov.b v5[9], v4[0]
-; CHECK-GI-NEXT: mov.b v5[10], v4[0]
-; CHECK-GI-NEXT: mov.b v5[11], v4[0]
-; CHECK-GI-NEXT: mov.b v5[12], v6[0]
-; CHECK-GI-NEXT: mov.b v5[13], v6[0]
-; CHECK-GI-NEXT: mov.b v5[14], v4[0]
-; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI11_1]
; CHECK-GI-NEXT: adrp x8, .LCPI11_0
-; CHECK-GI-NEXT: mov.b v5[15], v6[0]
-; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5
-; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT: mov.b v4[15], w0
+; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0]
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ret
@@ -623,32 +618,30 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s4, w0
; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: adrp x9, .LCPI12_1
; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT: ldr q5, [x9, :lo12:.LCPI12_1]
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov.16b v5, v4
-; CHECK-GI-NEXT: mov.b v5[1], v4[0]
-; CHECK-GI-NEXT: mov.b v5[2], v4[0]
-; CHECK-GI-NEXT: mov.b v5[3], v4[0]
-; CHECK-GI-NEXT: mov.b v5[4], v4[0]
-; CHECK-GI-NEXT: mov.b v5[5], v4[0]
-; CHECK-GI-NEXT: mov.b v5[6], v4[0]
-; CHECK-GI-NEXT: mov.b v5[7], v4[0]
-; CHECK-GI-NEXT: fmov s4, w8
-; CHECK-GI-NEXT: adrp x8, .LCPI12_1
-; CHECK-GI-NEXT: mov.b v5[8], v4[0]
-; CHECK-GI-NEXT: mov.b v5[9], v4[0]
-; CHECK-GI-NEXT: mov.b v5[10], v4[0]
-; CHECK-GI-NEXT: mov.b v5[11], v4[0]
-; CHECK-GI-NEXT: mov.b v5[12], v4[0]
-; CHECK-GI-NEXT: mov.b v5[13], v4[0]
-; CHECK-GI-NEXT: mov.b v5[14], v4[0]
-; CHECK-GI-NEXT: mov.b v5[15], v4[0]
-; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI12_1]
+; CHECK-GI-NEXT: mov.b v4[1], w0
+; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-GI-NEXT: mov.b v4[2], w0
+; CHECK-GI-NEXT: mov.b v4[3], w0
+; CHECK-GI-NEXT: mov.b v4[4], w0
+; CHECK-GI-NEXT: mov.b v4[5], w0
+; CHECK-GI-NEXT: mov.b v4[6], w0
+; CHECK-GI-NEXT: mov.b v4[7], w0
+; CHECK-GI-NEXT: mov.b v4[8], w8
+; CHECK-GI-NEXT: mov.b v4[9], w8
+; CHECK-GI-NEXT: mov.b v4[10], w8
+; CHECK-GI-NEXT: mov.b v4[11], w8
+; CHECK-GI-NEXT: mov.b v4[12], w8
+; CHECK-GI-NEXT: mov.b v4[13], w8
+; CHECK-GI-NEXT: mov.b v4[14], w8
+; CHECK-GI-NEXT: mov.b v4[15], w8
; CHECK-GI-NEXT: adrp x8, .LCPI12_0
-; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4
-; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI12_0]
; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0
; CHECK-GI-NEXT: ret
@@ -774,30 +767,28 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
; CHECK-GI-NEXT: mov w8, #255 // =0xff
; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: fmov s6, w8
-; CHECK-GI-NEXT: adrp x8, .LCPI13_1
; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT: mov.16b v5, v4
-; CHECK-GI-NEXT: mov.b v5[1], v4[0]
-; CHECK-GI-NEXT: mov.b v5[2], v4[0]
-; CHECK-GI-NEXT: mov.b v5[3], v4[0]
-; CHECK-GI-NEXT: mov.b v5[4], v4[0]
-; CHECK-GI-NEXT: mov.b v5[5], v4[0]
-; CHECK-GI-NEXT: mov.b v5[6], v4[0]
-; CHECK-GI-NEXT: mov.b v5[7], v4[0]
-; CHECK-GI-NEXT: mov.b v5[8], v6[0]
-; CHECK-GI-NEXT: mov.b v5[9], v6[0]
-; CHECK-GI-NEXT: mov.b v5[10], v6[0]
-; CHECK-GI-NEXT: mov.b v5[11], v6[0]
-; CHECK-GI-NEXT: mov.b v5[12], v6[0]
-; CHECK-GI-NEXT: mov.b v5[13], v6[0]
-; CHECK-GI-NEXT: mov.b v5[14], v4[0]
-; CHECK-GI-NEXT: mov.b v5[15], v4[0]
-; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI13_1]
+; CHECK-GI-NEXT: mov.b v4[1], w0
+; CHECK-GI-NEXT: mov.b v4[2], w0
+; CHECK-GI-NEXT: mov.b v4[3], w0
+; CHECK-GI-NEXT: mov.b v4[4], w0
+; CHECK-GI-NEXT: mov.b v4[5], w0
+; CHECK-GI-NEXT: mov.b v4[6], w0
+; CHECK-GI-NEXT: mov.b v4[7], w0
+; CHECK-GI-NEXT: mov.b v4[8], w8
+; CHECK-GI-NEXT: mov.b v4[9], w8
+; CHECK-GI-NEXT: mov.b v4[10], w8
+; CHECK-GI-NEXT: mov.b v4[11], w8
+; CHECK-GI-NEXT: mov.b v4[12], w8
+; CHECK-GI-NEXT: mov.b v4[13], w8
+; CHECK-GI-NEXT: adrp x8, .LCPI13_1
+; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI13_1]
; CHECK-GI-NEXT: adrp x8, .LCPI13_0
-; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4
-; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-GI-NEXT: mov.b v4[14], w0
+; CHECK-GI-NEXT: mov.b v4[15], w0
+; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI13_0]
; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 5de99586f7fc78..79cfeedb74bce0 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -13,7 +13,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
; CHECK-GI-LABEL: foo1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #58712 // =0xe558
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
@@ -33,7 +33,7 @@ define <4 x i16> @foo2(<2 x i32> %a) {
; CHECK-GI-LABEL: foo2:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #712 // =0x2c8
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
@@ -60,13 +60,11 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
; CHECK-GI-LABEL: bitcast_v4i8_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: xtn v0.8b, v1.8h
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = add <4 x i8> %a, %b
@@ -87,12 +85,13 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
; CHECK-GI-NEXT: add w8, w0, w1
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%c = add i32 %a, %b
@@ -117,9 +116,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI-LABEL: bitcast_v2i16_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: xtn v0.4h, v1.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = add <2 x i16> %a, %b
@@ -419,16 +418,17 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI-LABEL: bitcast_v2i16_v4i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: xtn v0.4h, v1.4s
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%c = add <2 x i16> %a, %b
@@ -455,13 +455,11 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
; CHECK-GI-LABEL: bitcast_v4i8_v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: xtn v0.8b, v1.8h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
@@ -515,10 +513,12 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){
;
; CHECK-GI-LABEL: bitcast_v8i32_v4i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mov x8, v2.d[1]
+; CHECK-GI-NEXT: mov x9, v3.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v2.d[0]
+; CHECK-GI-NEXT: mov v1.d[0], v3.d[0]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: ret
@@ -574,10 +574,12 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){
;
; CHECK-GI-LABEL: bitcast_v16i16_v4i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: add v2.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: add v3.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: mov x8, v2.d[1]
+; CHECK-GI-NEXT: mov x9, v3.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v2.d[0]
+; CHECK-GI-NEXT: mov v1.d[0], v3.d[0]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: ret
@@ -614,14 +616,18 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
;
; CHECK-GI-LABEL: bitcast_v16i32_v8i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s
-; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s
-; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mov x10, v2.d[1]
-; CHECK-GI-NEXT: mov x11, v3.d[1]
+; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: add v5.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: add v6.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: add v7.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: mov x8, v4.d[1]
+; CHECK-GI-NEXT: mov x9, v5.d[1]
+; CHECK-GI-NEXT: mov x10, v6.d[1]
+; CHECK-GI-NEXT: mov x11, v7.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v4.d[0]
+; CHECK-GI-NEXT: mov v1.d[0], v5.d[0]
+; CHECK-GI-NEXT: mov v2.d[0], v6.d[0]
+; CHECK-GI-NEXT: mov v3.d[0], v7.d[0]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: mov v2.d[1], x10
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 071613b9cc011e..9ee924dd2548a6 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -110,8 +110,8 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){
; CHECK-GI-LABEL: bswap_v2i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: rev16 v0.8b, v0.8b
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
@@ -146,7 +146,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index f6eeeef4faf7ed..18570b2d793ff6 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -11,12 +11,13 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
; CHECK-GI-LABEL: concat1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov s2, v1.s[1]
-; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v2.b[0]
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: mov v0.b[3], w9
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -32,22 +33,20 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) {
;
; CHECK-GI-LABEL: concat2:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v1.h[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: mov h5, v1.h[3]
-; CHECK-GI-NEXT: mov h6, v0.h[3]
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov v0.h[1], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[2], v4.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v5.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v6.h[0]
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT: xtn v0.8b, v2.8h
+; CHECK-GI-NEXT: xtn v1.8b, v3.8h
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -75,14 +74,16 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) {
;
; CHECK-GI-LABEL: concat4:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s2, v1.s[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s3, v0.s[1]
-; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov v0.s[1], v3.s[0]
-; CHECK-GI-NEXT: xtn v1.4h, v1.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: xtn v2.4h, v2.4s
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: xtn v1.4h, v0.4s
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -145,8 +146,9 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
;
; CHECK-GI-LABEL: concat9:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -181,12 +183,14 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v8s16_v2s16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: ldr h1, [x0]
; CHECK-GI-NEXT: ldr h2, [x0, #2]
+; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: xtn v2.4h, v0.4s
-; CHECK-GI-NEXT: xtn v0.4h, v1.4s
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
@@ -208,9 +212,10 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
; CHECK-GI-NEXT: dup v0.8h, w8
; CHECK-GI-NEXT: xtn v1.8b, v0.8h
; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[3], w8
; CHECK-GI-NEXT: ret
%a = load <4 x i8>, ptr %ptr
%b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -218,24 +223,13 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
}
define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) {
-; CHECK-SD-LABEL: concat_v16s8_v4s8_load:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr s0, [x0]
-; CHECK-SD-NEXT: ld1 { v0.s }[1], [x1]
-; CHECK-SD-NEXT: ld1 { v0.s }[2], [x2]
-; CHECK-SD-NEXT: ld1 { v0.s }[3], [x3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: concat_v16s8_v4s8_load:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: ldr s1, [x1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: ldr s1, [x2]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: ldr s1, [x3]
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: concat_v16s8_v4s8_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ld1 { v0.s }[1], [x1]
+; CHECK-NEXT: ld1 { v0.s }[2], [x2]
+; CHECK-NEXT: ld1 { v0.s }[3], [x3]
+; CHECK-NEXT: ret
%A = load <4 x i8>, ptr %ptrA
%B = load <4 x i8>, ptr %ptrB
%C = load <4 x i8>, ptr %ptrC
@@ -261,41 +255,35 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <
;
; CHECK-GI-LABEL: concat_v16s8_v4s8_reg:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h4, v1.h[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h5, v0.h[1]
+; CHECK-GI-NEXT: mov v4.h[0], v0.h[0]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v5.h[0], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: mov h6, v1.h[2]
-; CHECK-GI-NEXT: mov h7, v1.h[3]
-; CHECK-GI-NEXT: mov h16, v2.h[1]
-; CHECK-GI-NEXT: mov h17, v0.h[3]
-; CHECK-GI-NEXT: mov h18, v2.h[3]
-; CHECK-GI-NEXT: mov v1.h[1], v4.h[0]
-; CHECK-GI-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NEXT: mov v0.h[1], v5.h[0]
-; CHECK-GI-NEXT: mov h5, v2.h[2]
-; CHECK-GI-NEXT: mov v2.h[1], v16.h[0]
-; CHECK-GI-NEXT: mov v1.h[2], v6.h[0]
-; CHECK-GI-NEXT: mov h6, v3.h[1]
-; CHECK-GI-NEXT: mov v0.h[2], v4.h[0]
-; CHECK-GI-NEXT: mov v2.h[2], v5.h[0]
-; CHECK-GI-NEXT: mov h4, v3.h[2]
-; CHECK-GI-NEXT: mov h5, v3.h[3]
-; CHECK-GI-NEXT: mov v1.h[3], v7.h[0]
-; CHECK-GI-NEXT: mov v3.h[1], v6.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v17.h[0]
-; CHECK-GI-NEXT: mov v2.h[3], v18.h[0]
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-NEXT: mov v3.h[2], v4.h[0]
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: xtn v2.8b, v2.8h
-; CHECK-GI-NEXT: mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT: mov v6.h[0], v2.h[0]
+; CHECK-GI-NEXT: mov v7.h[0], v3.h[0]
+; CHECK-GI-NEXT: mov v4.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v5.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v6.h[1], v2.h[1]
+; CHECK-GI-NEXT: mov v7.h[1], v3.h[1]
+; CHECK-GI-NEXT: mov v4.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v5.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v6.h[2], v2.h[2]
+; CHECK-GI-NEXT: mov v7.h[2], v3.h[2]
+; CHECK-GI-NEXT: mov v4.h[3], v0.h[3]
+; CHECK-GI-NEXT: mov v5.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v6.h[3], v2.h[3]
+; CHECK-GI-NEXT: mov v7.h[3], v3.h[3]
+; CHECK-GI-NEXT: xtn v0.8b, v4.8h
+; CHECK-GI-NEXT: xtn v1.8b, v5.8h
+; CHECK-GI-NEXT: xtn v2.8b, v6.8h
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: xtn v1.8b, v7.8h
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: xtn v1.8b, v3.8h
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[3], w8
@@ -320,27 +308,29 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %
;
; CHECK-GI-LABEL: concat_v8s16_v2s16_reg:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s4, v1.s[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s5, v0.s[1]
+; CHECK-GI-NEXT: mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v5.s[0], v1.s[0]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: mov v1.s[1], v4.s[0]
-; CHECK-GI-NEXT: mov s4, v2.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-GI-NEXT: xtn v0.4h, v4.4s
+; CHECK-GI-NEXT: xtn v4.4h, v5.4s
+; CHECK-GI-NEXT: mov v1.s[1], v2.s[1]
+; CHECK-GI-NEXT: mov v2.s[0], v3.s[0]
+; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: xtn v1.4h, v1.4s
-; CHECK-GI-NEXT: mov v2.s[1], v4.s[0]
-; CHECK-GI-NEXT: mov s4, v3.s[1]
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v2.s[1], v3.s[1]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: fmov w8, s4
; CHECK-GI-NEXT: xtn v2.4h, v2.4s
-; CHECK-GI-NEXT: mov v3.s[1], v4.s[0]
-; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
-; CHECK-GI-NEXT: xtn v1.4h, v3.4s
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov v0.s[3], w8
; CHECK-GI-NEXT: ret
%b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index de108b0bc2b7a0..e19e2ead11f4d0 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -161,27 +161,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fabs v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 6913a62fb266c1..b15579199a0598 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -188,33 +188,25 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_v7f16:
@@ -537,33 +529,25 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fsub_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a5d7ae147ffda2..8ca1e9ee5b6178 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -556,7 +556,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w19, lt
; CHECK-GI-NEXT: bl __lttf2
-; CHECK-GI-NEXT: fmov d0, x19
+; CHECK-GI-NEXT: mov v0.d[0], x19
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w8, lt
; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload
@@ -663,29 +663,29 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w22, lt
; CHECK-GI-NEXT: bl __lttf2
-; CHECK-GI-NEXT: ldp q0, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: sbfx x8, x21, #0, #1
-; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload
-; CHECK-GI-NEXT: sbfx x9, x22, #0, #1
-; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v2.d[1], v0.d[0]
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v1.d[0], x8
+; CHECK-GI-NEXT: sbfx x8, x22, #0, #1
+; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mov v1.d[1], x8
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: cset w8, lt
; CHECK-GI-NEXT: sbfx x8, x8, #0, #1
-; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT: bic v0.16b, v3.16b, v0.16b
; CHECK-GI-NEXT: and x9, x19, x8
; CHECK-GI-NEXT: bic x8, x20, x8
; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: orr x8, x9, x8
-; CHECK-GI-NEXT: bic v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #176
@@ -831,21 +831,21 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
; CHECK-GI-NEXT: fcmp d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: cset w9, mi
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: fmov d2, x9
+; CHECK-GI-NEXT: mov v2.d[0], x9
+; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v3.s[0], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: mov v3.s[1], w9
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: neg v1.4s, v1.4s
-; CHECK-GI-NEXT: mov v2.s[2], w8
+; CHECK-GI-NEXT: mov v3.s[2], w9
; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b
; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -902,18 +902,18 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
; CHECK-GI-LABEL: v3f32_float:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v4.s[0], w8
+; CHECK-GI-NEXT: mov v5.s[0], w9
; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[2], w9
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v4.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: neg v1.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -980,18 +980,18 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
; CHECK-GI-LABEL: v3f32_i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v4.s[0], w8
+; CHECK-GI-NEXT: mov v5.s[0], w9
; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[2], w9
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v4.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: neg v1.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -1106,44 +1106,38 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
; CHECK-GI-NOFP16-LABEL: v7f16_half:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: fmov s4, w8
-; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff
-; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: fmov s5, w8
+; CHECK-GI-NOFP16-NEXT: mov w9, #65535 // =0xffff
+; CHECK-GI-NOFP16-NEXT: fmov s7, w9
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v7.h[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v7.h[2], w9
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: fmov s6, w8
-; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: mov v7.h[3], w9
; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s
-; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[4], w8
+; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v6.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v7.h[4], w9
+; CHECK-GI-NOFP16-NEXT: mov v5.h[5], w8
; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h
-; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h
+; CHECK-GI-NOFP16-NEXT: mov v7.h[5], w9
+; CHECK-GI-NOFP16-NEXT: mov v5.h[6], w8
+; CHECK-GI-NOFP16-NEXT: mov v7.h[6], w9
+; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h
; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v7.16b
; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -1152,28 +1146,26 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
; CHECK-GI-FP16-LABEL: v7f16_half:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff
; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
; CHECK-GI-FP16-NEXT: fmov s4, w8
-; CHECK-GI-FP16-NEXT: mov w8, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT: fmov s6, w8
-; CHECK-GI-FP16-NEXT: mov v5.16b, v4.16b
-; CHECK-GI-FP16-NEXT: mov v7.16b, v6.16b
-; CHECK-GI-FP16-NEXT: mov v5.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[2], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[2], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[4], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[4], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[5], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0]
-; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0]
-; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0]
-; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h
+; CHECK-GI-FP16-NEXT: fmov s5, w9
+; CHECK-GI-FP16-NEXT: mov v4.h[1], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[1], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[2], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[2], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[3], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[3], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[4], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[4], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[5], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[5], w9
+; CHECK-GI-FP16-NEXT: mov v4.h[6], w8
+; CHECK-GI-FP16-NEXT: mov v5.h[6], w9
+; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-FP16-NEXT: neg v1.8h, v4.8h
; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b
+; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v5.16b
; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-FP16-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-FP16-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -1599,59 +1591,52 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-NOFP16-LABEL: v7f16_i32:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #32]
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT: fmov s16, w0
-; CHECK-GI-NOFP16-NEXT: fmov s18, w4
+; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8
+; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0
+; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp]
+; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24]
+; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fmov s3, w8
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: ldr s5, [sp]
-; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1
-; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5
-; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8
-; CHECK-GI-NOFP16-NEXT: fmov w9, s5
-; CHECK-GI-NOFP16-NEXT: fmov s5, w7
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: ldr s7, [sp, #24]
-; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2
-; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w9
-; CHECK-GI-NOFP16-NEXT: fmov w9, s6
-; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8
-; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40]
+; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2
+; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9
+; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3
-; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0]
-; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: fmov s4, w8
-; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: fmov w8, s6
-; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8
-; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3
+; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4
+; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16]
+; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0]
+; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b
; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2]
@@ -1670,59 +1655,56 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
; CHECK-GI-FP16-LABEL: v7f16_i32:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f
-; CHECK-GI-FP16-NEXT: ldr s3, [sp]
-; CHECK-GI-FP16-NEXT: fmov s2, w10
-; CHECK-GI-FP16-NEXT: fmov s6, w0
-; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8]
-; CHECK-GI-FP16-NEXT: fmov s17, w4
-; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24]
+; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f
+; CHECK-GI-FP16-NEXT: mov v4.s[0], w0
+; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
+; CHECK-GI-FP16-NEXT: mov v5.s[0], w7
+; CHECK-GI-FP16-NEXT: ldr s6, [sp]
+; CHECK-GI-FP16-NEXT: mov v7.s[0], w4
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32]
+; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8]
; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT: umov w9, v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v2.s[1], w10
-; CHECK-GI-FP16-NEXT: mov v6.s[1], w1
-; CHECK-GI-FP16-NEXT: mov v17.s[1], w5
-; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0]
+; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v4.s[1], w1
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0]
+; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-FP16-NEXT: mov v7.s[1], w5
+; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT: fmov s1, w8
+; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT: mov v2.s[2], w10
; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT: mov v6.s[2], w2
-; CHECK-GI-FP16-NEXT: mov v17.s[2], w6
-; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT: mov v1.s[1], w9
-; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT: fmov s5, w9
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT: mov v4.s[2], w2
+; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0]
+; CHECK-GI-FP16-NEXT: mov v7.s[2], w6
; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT: mov v6.s[3], w3
-; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
-; CHECK-GI-FP16-NEXT: fmov w8, s3
-; CHECK-GI-FP16-NEXT: fmov s3, w7
-; CHECK-GI-FP16-NEXT: mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
+; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT: mov v3.s[0], w10
+; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT: mov v3.s[1], w8
-; CHECK-GI-FP16-NEXT: fmov w8, s4
-; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT: mov v3.s[1], w10
; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s
; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s
-; CHECK-GI-FP16-NEXT: mov v5.s[2], w9
-; CHECK-GI-FP16-NEXT: mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT: mov v3.s[2], w10
; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s
-; CHECK-GI-FP16-NEXT: fmov w8, s4
-; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b
-; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-GI-FP16-NEXT: mov v3.s[3], w8
-; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b
-; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT: ldr s2, [sp, #16]
+; CHECK-GI-FP16-NEXT: mov v5.s[3], v2.s[0]
+; CHECK-GI-FP16-NEXT: eor v3.16b, v1.16b, v3.16b
+; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b
+; CHECK-GI-FP16-NEXT: and v2.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v5.16b
; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b
; CHECK-GI-FP16-NEXT: mov s2, v0.s[1]
; CHECK-GI-FP16-NEXT: mov s3, v0.s[2]
; CHECK-GI-FP16-NEXT: mov s4, v0.s[3]
+; CHECK-GI-FP16-NEXT: fmov w0, s0
; CHECK-GI-FP16-NEXT: mov s5, v1.s[1]
; CHECK-GI-FP16-NEXT: mov s6, v1.s[2]
-; CHECK-GI-FP16-NEXT: fmov w0, s0
; CHECK-GI-FP16-NEXT: fmov w4, s1
; CHECK-GI-FP16-NEXT: fmov w1, s2
; CHECK-GI-FP16-NEXT: fmov w2, s3
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 84376107679d84..a42ec8e253be29 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -156,8 +156,8 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: mov v3.s[0], w8
; CHECK-GI-NEXT: mov v2.s[1], w9
; CHECK-GI-NEXT: mov v3.s[1], w8
; CHECK-GI-NEXT: mov v2.s[2], w9
@@ -207,22 +207,20 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff
; CHECK-GI-NEXT: fmov s2, w9
; CHECK-GI-NEXT: fmov s3, w8
-; CHECK-GI-NEXT: mov v4.16b, v2.16b
-; CHECK-GI-NEXT: mov v5.16b, v3.16b
-; CHECK-GI-NEXT: mov v4.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[1], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[2], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[3], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[3], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[4], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[5], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[5], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[6], v2.h[0]
-; CHECK-GI-NEXT: mov v5.h[6], v3.h[0]
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: mov v2.h[1], w9
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: mov v2.h[2], w9
+; CHECK-GI-NEXT: mov v3.h[2], w8
+; CHECK-GI-NEXT: mov v2.h[3], w9
+; CHECK-GI-NEXT: mov v3.h[3], w8
+; CHECK-GI-NEXT: mov v2.h[4], w9
+; CHECK-GI-NEXT: mov v3.h[4], w8
+; CHECK-GI-NEXT: mov v2.h[5], w9
+; CHECK-GI-NEXT: mov v3.h[5], w8
+; CHECK-GI-NEXT: mov v2.h[6], w9
+; CHECK-GI-NEXT: mov v3.h[6], w8
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 1c761ea083028a..b408e9c1bd4e60 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -164,27 +164,21 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: ceil_v7f16:
@@ -469,27 +463,21 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: floor_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: floor_v7f16:
@@ -774,27 +762,21 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
@@ -1079,27 +1061,21 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: roundeven_v7f16:
@@ -1384,27 +1360,21 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: rint_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: rint_v7f16:
@@ -1689,27 +1659,21 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: round_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: round_v7f16:
@@ -1994,27 +1958,21 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: trunc_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index d73a5dc73eefcd..5bdccccc62b99c 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -188,33 +188,25 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fdiv v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fdiv_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 93d3d96d67b650..30ce389f231281 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -678,12 +678,12 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -701,18 +701,19 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -789,21 +790,21 @@ define <4 x half> @exp_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -919,12 +920,12 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -947,21 +948,21 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -1155,7 +1156,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h9
@@ -1180,7 +1181,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -1231,7 +1232,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl expf
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -1257,7 +1258,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
@@ -1948,12 +1949,12 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -1971,18 +1972,19 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2059,21 +2061,21 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2189,12 +2191,12 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -2217,21 +2219,21 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2425,7 +2427,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h9
@@ -2450,7 +2452,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -2501,7 +2503,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl exp2f
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -2527,7 +2529,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
@@ -3218,12 +3220,12 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -3241,18 +3243,19 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -3329,21 +3332,21 @@ define <4 x half> @log_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -3459,12 +3462,12 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -3487,21 +3490,21 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -3695,7 +3698,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h9
@@ -3720,7 +3723,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -3771,7 +3774,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl logf
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -3797,7 +3800,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
@@ -4488,12 +4491,12 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -4511,18 +4514,19 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -4599,21 +4603,21 @@ define <4 x half> @log2_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -4729,12 +4733,12 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -4757,21 +4761,21 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -4965,7 +4969,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h9
@@ -4990,7 +4994,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -5041,7 +5045,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log2f
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -5067,7 +5071,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
@@ -5758,12 +5762,12 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -5781,18 +5785,19 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -5869,21 +5874,21 @@ define <4 x half> @log10_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -5999,12 +6004,12 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h11
@@ -6027,21 +6032,21 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -6235,7 +6240,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h9
@@ -6260,7 +6265,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -6311,7 +6316,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl log10f
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -6337,7 +6342,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 2ea7e0f3c44a9a..aa20304e52a951 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -11,13 +11,15 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
; CHECK-GI-LABEL: interleave2_v4f16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v2.4s, w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: xtn v2.4h, v2.4s
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: xtn v0.4h, v2.4s
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
%retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 357d91960624bd..fb12f8acf17453 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
; CHECK-NOFP16-GI-NEXT: fmin v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
; CHECK-NOFP16-GI-NEXT: fmax v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 61199f82615bbe..64f0da8b4cd0f9 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
; CHECK-NOFP16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4]
; CHECK-NOFP16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2]
; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3]
; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 4b019b57d968d3..7bcaae5a77eac5 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -257,39 +257,29 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h18, v2.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h2, v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmla v5.4s, v4.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v5.4s
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v18.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fmla v4.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v5.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fma_v7f16:
@@ -864,46 +854,36 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4]
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
@@ -1362,46 +1342,36 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4]
-; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 1f41f2385c3357..bd3d1353e643e5 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -188,33 +188,25 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index cc0f7d2fd6075d..a0e9edff733e09 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -162,27 +162,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fneg v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fneg v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fneg v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index 8d40121ad4543f..6e8cd0c8c00b41 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -965,22 +965,22 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) {
; CHECK-GI-NEXT: fcvt s2, h9
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: fcvt s1, h12
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s2
; CHECK-GI-NEXT: bl powf
; CHECK-GI-NEXT: fcvt s2, h10
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: fcvt s1, h13
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s2
; CHECK-GI-NEXT: bl powf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 5dbcaa4a5fda17..62fc1c0854ca8b 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -869,22 +869,22 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) {
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: mov w0, w19
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl __powisf2
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: mov w0, w19
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl __powisf2
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 0c880592d955b7..20b5567e973d09 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2585,7 +2585,7 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptos_v3f32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[2]
; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s
; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
@@ -2614,7 +2614,7 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fptou_v3f32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[2]
; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s
; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d
@@ -3181,10 +3181,10 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
; CHECK-GI-LABEL: fptos_v3f32_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -3202,10 +3202,10 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
; CHECK-GI-LABEL: fptou_v3f32_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -6077,10 +6077,10 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -6110,10 +6110,10 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -7297,7 +7297,7 @@ define <2 x i64> @fptos_v2f128_v2i64(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x19, x0
; CHECK-GI-NEXT: bl __fixtfdi
-; CHECK-GI-NEXT: fmov d0, x19
+; CHECK-GI-NEXT: mov v0.d[0], x19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7340,7 +7340,7 @@ define <2 x i64> @fptou_v2f128_v2i64(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x19, x0
; CHECK-GI-NEXT: bl __fixunstfdi
-; CHECK-GI-NEXT: fmov d0, x19
+; CHECK-GI-NEXT: mov v0.d[0], x19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7496,7 +7496,7 @@ define <2 x i32> @fptos_v2f128_v2i32(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7539,7 +7539,7 @@ define <2 x i32> @fptou_v2f128_v2i32(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7591,7 +7591,7 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7644,7 +7644,7 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7689,9 +7689,8 @@ define <2 x i16> @fptos_v2f128_v2i16(<2 x fp128> %a) {
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: fmov s0, w19
-; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w0
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7734,9 +7733,8 @@ define <2 x i16> @fptou_v2f128_v2i16(<2 x fp128> %a) {
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: fmov s0, w19
-; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w0
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7791,12 +7789,10 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) {
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: fmov s0, w19
-; CHECK-GI-NEXT: fmov s1, w20
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.h[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[2], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
@@ -7850,12 +7846,10 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) {
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: fmov s0, w19
-; CHECK-GI-NEXT: fmov s1, w20
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.h[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[2], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ret
@@ -7896,7 +7890,7 @@ define <2 x i8> @fptos_v2f128_v2i8(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7939,7 +7933,7 @@ define <2 x i8> @fptou_v2f128_v2i8(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: fmov s0, w19
+; CHECK-GI-NEXT: mov v0.s[0], w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index aec5d7959226c3..c0d4ddef23132d 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -261,9 +261,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fcvt s2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
entry:
@@ -363,9 +363,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index 1a10fd2f1cdc3d..fe5146d79895cb 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -952,22 +952,22 @@ define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) {
; CHECK-GI-NEXT: fcvt s2, h9
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: fcvt s1, h12
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s2
; CHECK-GI-NEXT: bl fmodf
; CHECK-GI-NEXT: fcvt s2, h10
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: fcvt s1, h13
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s2
; CHECK-GI-NEXT: bl fmodf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 0b34f9570fa77b..557add3a4eaeb2 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -678,12 +678,12 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -701,18 +701,19 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -789,21 +790,21 @@ define <4 x half> @sin_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -919,12 +920,12 @@ define <8 x half> @sin_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -947,21 +948,21 @@ define <8 x half> @sin_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -1155,7 +1156,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h9
@@ -1180,7 +1181,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -1231,7 +1232,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl sinf
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -1257,7 +1258,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
@@ -1948,12 +1949,12 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -1971,18 +1972,19 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2059,21 +2061,21 @@ define <4 x half> @cos_v4f16(<4 x half> %a) {
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2189,12 +2191,12 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h9
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h10
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h11
@@ -2217,21 +2219,21 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT: mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[5], v3.h[0]
; CHECK-GI-NEXT: mov v1.h[6], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[7], v0.h[0]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
@@ -2425,7 +2427,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h8
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h9
@@ -2450,7 +2452,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: fcvt s1, h13
; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -2501,7 +2503,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: fmov s0, s1
; CHECK-GI-NEXT: bl cosf
; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -2527,7 +2529,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[5], v2.h[0]
; CHECK-GI-NEXT: fcvt h2, s0
-; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v3.h[6], v0.h[0]
; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v1.h[6], v0.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 4b48bcc5508db0..6c5fd8e52b017c 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -196,27 +196,21 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: sqrt_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 6baf1a84d407c4..b00e5d6c701d8b 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1228,18 +1228,18 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
; CHECK-GI-LABEL: v3i32_i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v4.s[0], w8
+; CHECK-GI-NEXT: mov v5.s[0], w9
; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[2], w9
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v4.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: neg v1.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 8b82004388b095..296e267a9c7f0b 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -250,23 +250,13 @@ entry:
}
define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) {
-; CHECK-SD-LABEL: insert_v3f32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-SD-NEXT: mov v1.s[2], v0.s[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: insert_v3f32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT: mov s0, v0.s[2]
-; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: insert_v3f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
entry:
%d = insertelement <3 x float> %a, float %b, i32 0
ret <3 x float> %d
@@ -281,10 +271,11 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3f32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x float> %a, float %b, i32 2
@@ -983,11 +974,9 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_0:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v1.s[2], w9
+; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
@@ -1003,10 +992,10 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v1.s[2], w0
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 7a4c5cee27b805..4ac04798e15481 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3309,30 +3309,28 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) {
; CHECK-GI-LABEL: stofp_v3i8_v3f64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w1
+; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov h2, v1.h[1]
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GI-NEXT: smov x8, v0.s[0]
; CHECK-GI-NEXT: smov x9, v0.s[1]
-; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT: fmov d1, x8
-; CHECK-GI-NEXT: smov x8, v0.s[0]
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: smov x8, v1.s[0]
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: smov x9, v1.s[1]
+; CHECK-GI-NEXT: mov v1.d[0], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: smov x9, v0.s[1]
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: scvtf v0.2d, v1.2d
-; CHECK-GI-NEXT: mov v2.d[1], x9
+; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: scvtf v2.2d, v1.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: scvtf v2.2d, v2.2d
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
@@ -3363,30 +3361,28 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) {
; CHECK-GI-LABEL: utofp_v3i8_v3f64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT: mov v0.h[1], w1
+; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov h2, v1.h[1]
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT: fmov d1, x8
-; CHECK-GI-NEXT: mov w8, v0.s[0]
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov w8, v1.s[0]
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v1.d[0], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: ucvtf v0.2d, v1.2d
-; CHECK-GI-NEXT: mov v2.d[1], x9
+; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d
; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
@@ -4479,13 +4475,13 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: scvtf v1.2d, v2.2d
+; CHECK-GI-NEXT: scvtf v2.2d, v2.2d
; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%c = sitofp <3 x i64> %a to <3 x float>
@@ -4511,13 +4507,13 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ucvtf v1.2d, v2.2d
+; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d
; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%c = uitofp <3 x i64> %a to <3 x float>
@@ -5267,10 +5263,8 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) {
; CHECK-GI-LABEL: stofp_v3i8_v3f32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w1
+; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: mov s1, v0.s[1]
@@ -5302,11 +5296,9 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) {
; CHECK-GI-LABEL: utofp_v3i8_v3f32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT: mov v0.h[1], w1
+; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov h2, v0.h[1]
@@ -6227,9 +6219,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -6276,9 +6268,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7215,9 +7207,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: stofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7238,9 +7230,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: utofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7448,9 +7440,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7459,8 +7451,8 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -7491,9 +7483,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7502,8 +7494,8 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -7986,9 +7978,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7997,9 +7989,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s
; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
@@ -8048,9 +8040,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -8059,16 +8051,16 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -8105,10 +8097,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-NOFP16-LABEL: stofp_v3i8_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fmov s0, w0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w1
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fmov s1, w2
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2
; CHECK-GI-NOFP16-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NOFP16-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
@@ -8126,10 +8116,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fmov s0, w0
-; CHECK-GI-FP16-NEXT: fmov s1, w1
-; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-FP16-NEXT: fmov s1, w2
-; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT: mov v0.b[1], w1
+; CHECK-GI-FP16-NEXT: mov v0.b[2], w2
; CHECK-GI-FP16-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ret
@@ -8162,11 +8150,9 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-NOFP16-LABEL: utofp_v3i8_v3f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fmov s0, w0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w1
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT: fmov s1, w2
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0]
; CHECK-GI-NOFP16-NEXT: movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
@@ -8183,10 +8169,8 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fmov s0, w0
-; CHECK-GI-FP16-NEXT: fmov s1, w1
-; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-FP16-NEXT: fmov s1, w2
-; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT: mov v0.b[1], w1
+; CHECK-GI-FP16-NEXT: mov v0.b[2], w2
; CHECK-GI-FP16-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 51d17ad0644f15..c1ea891bc86e7e 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -267,21 +267,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
; GISEL-NEXT: bl exp10f
; GISEL-NEXT: fcvt s1, h9
; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
; GISEL-NEXT: fmov s0, s1
; GISEL-NEXT: bl exp10f
; GISEL-NEXT: fcvt s1, h10
; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
+; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; GISEL-NEXT: fmov s0, s1
; GISEL-NEXT: bl exp10f
-; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; GISEL-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload
; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; GISEL-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
; GISEL-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
; GISEL-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
-; GISEL-NEXT: mov v1.h[1], v2.h[0]
-; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; GISEL-NEXT: mov v1.h[1], v3.h[0]
; GISEL-NEXT: mov v1.h[2], v2.h[0]
; GISEL-NEXT: mov v1.h[3], v0.h[0]
; GISEL-NEXT: mov v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c3c0ec5e3d9d8d..a4d1c53c272aa1 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -118,7 +118,7 @@ define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){
;
; CHECK-GI-LABEL: load_v2i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0]
; CHECK-GI-NEXT: ldr b1, [x0, #1]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -158,8 +158,8 @@ define <2 x i16> @load_v2i16(ptr %ptr){
; CHECK-GI-LABEL: load_v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -235,6 +235,7 @@ define <7 x i8> @load_v7i8(ptr %ptr){
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr b0, [x0]
; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: ldr b1, [x0, #2]
; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
@@ -261,10 +262,10 @@ define <3 x i16> @load_v3i16(ptr %ptr){
; CHECK-GI-LABEL: load_v3i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #4]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: add x8, x0, #4
+; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%a = load <3 x i16>, ptr %ptr
@@ -280,18 +281,18 @@ define <7 x i16> @load_v7i16(ptr %ptr){
; CHECK-GI-LABEL: load_v7i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #4]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #6]
-; CHECK-GI-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #8]
-; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #10]
-; CHECK-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-GI-NEXT: ldr h1, [x0, #12]
-; CHECK-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: add x8, x0, #4
+; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT: add x8, x0, #6
+; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT: add x8, x0, #8
+; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT: add x8, x0, #10
+; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT: add x8, x0, #12
+; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8]
; CHECK-GI-NEXT: ret
%a = load <7 x i16>, ptr %ptr
ret <7 x i16> %a
@@ -305,10 +306,11 @@ define <3 x i32> @load_v3i32(ptr %ptr){
;
; CHECK-GI-LABEL: load_v3i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldp s0, s1, [x0]
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: ldr s1, [x0, #8]
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: ldr s0, [x0]
+; CHECK-GI-NEXT: add x8, x0, #4
+; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT: add x8, x0, #8
+; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8]
; CHECK-GI-NEXT: ret
%a = load <3 x i32>, ptr %ptr
ret <3 x i32> %a
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 50c0c8b11e7517..dbb4270fb8002e 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1120,12 +1120,10 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: mov v3.16b, v1.16b
-; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov v3.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v3.b[3], v1.b[0]
-; CHECK-GI-NEXT: ushll v1.8h, v3.8b, #0
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: mov v1.b[2], w9
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #15
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #15
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
@@ -1148,10 +1146,9 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
; CHECK-GI-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: mov v3.16b, v1.16b
-; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT: ushll v1.4s, v3.4h, #0
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31
@@ -1199,12 +1196,10 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: fmov s3, w9
-; CHECK-GI-NEXT: mov v4.16b, v2.16b
-; CHECK-GI-NEXT: mov v4.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v4.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov v4.b[3], v2.b[0]
-; CHECK-GI-NEXT: ushll v2.8h, v4.8b, #0
+; CHECK-GI-NEXT: mov v2.b[1], w9
+; CHECK-GI-NEXT: mov v2.b[2], w9
+; CHECK-GI-NEXT: mov v2.b[3], w8
+; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-GI-NEXT: shl v2.4h, v2.4h, #15
; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #15
; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
@@ -1227,10 +1222,9 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-GI-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NEXT: fmov s2, w8
; CHECK-GI-NEXT: fmov s3, w9
-; CHECK-GI-NEXT: mov v4.16b, v2.16b
-; CHECK-GI-NEXT: mov v4.h[1], v3.h[0]
-; CHECK-GI-NEXT: mov v3.h[1], v2.h[0]
-; CHECK-GI-NEXT: ushll v2.4s, v4.4h, #0
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: mov v2.h[1], w9
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index 59958afdd0d1e9..adc89f7a0d99d8 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -2674,10 +2674,10 @@ define <4 x i32> @fcmal4xfloat(<4 x float> %A, <4 x float> %B) {
; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v1.16b, v0.16b
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v0.h[0]
-; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w8
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31
; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31
@@ -2725,10 +2725,10 @@ define <4 x i32> @fcmnv4xfloat(<4 x float> %A, <4 x float> %B) {
; CHECK-GI-NEXT: mov w8, #0 // =0x0
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v1.16b, v0.16b
-; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v0.h[0]
-; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w8
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31
; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 402682c89124bd..de2c9d50b80540 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1267,87 +1267,87 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
; CHECK-GI-LABEL: v20:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s4, [sp, #8]
-; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: ldr s18, [sp, #8]
+; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: mov v4.s[0], w4
+; CHECK-GI-NEXT: ldr s17, [sp, #96]
+; CHECK-GI-NEXT: ldr s25, [sp, #104]
+; CHECK-GI-NEXT: mov v0.s[1], v18.s[0]
+; CHECK-GI-NEXT: ldr s18, [sp, #128]
+; CHECK-GI-NEXT: ldr s26, [sp, #136]
+; CHECK-GI-NEXT: ldr s22, [sp, #16]
; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s19, [sp, #40]
-; CHECK-GI-NEXT: fmov s3, w4
-; CHECK-GI-NEXT: mov v0.s[1], v4.s[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #96]
-; CHECK-GI-NEXT: ldr s22, [sp, #104]
-; CHECK-GI-NEXT: mov v2.s[1], v19.s[0]
-; CHECK-GI-NEXT: ldr s19, [sp, #128]
-; CHECK-GI-NEXT: ldr s23, [sp, #136]
-; CHECK-GI-NEXT: ldr s18, [sp, #16]
+; CHECK-GI-NEXT: mov v17.s[1], v25.s[0]
; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: mov v3.s[1], w5
-; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v19.s[1], v23.s[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-NEXT: ldr s24, [sp, #40]
+; CHECK-GI-NEXT: mov v18.s[1], v26.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], w5
+; CHECK-GI-NEXT: ldr s3, [sp, #64]
; CHECK-GI-NEXT: ldr s21, [sp, #72]
-; CHECK-GI-NEXT: mov v0.s[2], v18.s[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #160]
-; CHECK-GI-NEXT: ldr s24, [sp, #168]
+; CHECK-GI-NEXT: ldr s19, [sp, #160]
+; CHECK-GI-NEXT: ldr s27, [sp, #168]
; CHECK-GI-NEXT: ldr s20, [sp, #192]
-; CHECK-GI-NEXT: ldr s25, [sp, #200]
+; CHECK-GI-NEXT: ldr s28, [sp, #200]
+; CHECK-GI-NEXT: mov v0.s[2], v22.s[0]
; CHECK-GI-NEXT: ldr s22, [sp, #224]
-; CHECK-GI-NEXT: ldr s27, [sp, #232]
+; CHECK-GI-NEXT: ldr s25, [sp, #232]
; CHECK-GI-NEXT: ldr s23, [sp, #112]
-; CHECK-GI-NEXT: ldr s26, [sp, #144]
-; CHECK-GI-NEXT: mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT: mov v20.s[1], v25.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v22.s[1], v27.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v24.s[0]
+; CHECK-GI-NEXT: ldr s24, [sp, #144]
+; CHECK-GI-NEXT: mov v19.s[1], v27.s[0]
+; CHECK-GI-NEXT: mov v20.s[1], v28.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v21.s[0]
+; CHECK-GI-NEXT: mov v22.s[1], v25.s[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #48]
; CHECK-GI-NEXT: mov v1.s[2], w2
-; CHECK-GI-NEXT: ldr s17, [sp, #48]
-; CHECK-GI-NEXT: mov v3.s[2], w6
-; CHECK-GI-NEXT: mov v16.s[2], v23.s[0]
-; CHECK-GI-NEXT: mov v19.s[2], v26.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], w6
+; CHECK-GI-NEXT: mov v17.s[2], v23.s[0]
+; CHECK-GI-NEXT: mov v18.s[2], v24.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #176]
-; CHECK-GI-NEXT: ldr s24, [sp, #208]
-; CHECK-GI-NEXT: ldr s25, [sp, #240]
-; CHECK-GI-NEXT: mov v2.s[2], v17.s[0]
-; CHECK-GI-NEXT: ldr s17, [sp, #120]
+; CHECK-GI-NEXT: ldr s26, [sp, #208]
+; CHECK-GI-NEXT: ldr s24, [sp, #240]
+; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #120]
; CHECK-GI-NEXT: ldr s23, [sp, #152]
; CHECK-GI-NEXT: ldr s5, [sp, #24]
-; CHECK-GI-NEXT: mov v18.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v20.s[2], v24.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v22.s[2], v25.s[0]
+; CHECK-GI-NEXT: mov v19.s[2], v21.s[0]
+; CHECK-GI-NEXT: mov v20.s[2], v26.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v22.s[2], v24.s[0]
; CHECK-GI-NEXT: mov v1.s[3], w3
-; CHECK-GI-NEXT: mov v3.s[3], w7
-; CHECK-GI-NEXT: mov v16.s[3], v17.s[0]
-; CHECK-GI-NEXT: mov v19.s[3], v23.s[0]
+; CHECK-GI-NEXT: mov v4.s[3], w7
+; CHECK-GI-NEXT: mov v17.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v18.s[3], v23.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #56]
; CHECK-GI-NEXT: ldr s7, [sp, #184]
; CHECK-GI-NEXT: ldr s21, [sp, #216]
-; CHECK-GI-NEXT: ldr s17, [sp, #88]
+; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: mov v0.s[3], v5.s[0]
; CHECK-GI-NEXT: ldr s5, [sp, #248]
; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v18.s[3], v7.s[0]
+; CHECK-GI-NEXT: mov v19.s[3], v7.s[0]
; CHECK-GI-NEXT: mov v20.s[3], v21.s[0]
-; CHECK-GI-NEXT: mov v4.s[3], v17.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v16.s[0]
; CHECK-GI-NEXT: mov v22.s[3], v5.s[0]
-; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT: uzp1 v5.8h, v16.8h, v19.8h
+; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v4.8h
+; CHECK-GI-NEXT: movi v4.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT: uzp1 v5.8h, v17.8h, v18.8h
; CHECK-GI-NEXT: dup v6.4s, w8
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT: uzp1 v2.8h, v18.8h, v20.8h
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-GI-NEXT: uzp1 v2.8h, v19.8h, v20.8h
+; CHECK-GI-NEXT: uzp1 v3.8h, v3.8h, v6.8h
; CHECK-GI-NEXT: uzp1 v6.8h, v22.8h, v6.8h
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT: and v5.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
; CHECK-GI-NEXT: add v1.8h, v1.8h, v5.8h
-; CHECK-GI-NEXT: and v4.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT: and v3.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v6.16b, v4.16b
; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT: add v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT: add v3.4h, v3.4h, v4.4h
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
@@ -1459,59 +1459,59 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
;
; CHECK-GI-LABEL: i12:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s4, w4
+; CHECK-GI-NEXT: mov v1.s[0], w0
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s20, [sp, #8]
+; CHECK-GI-NEXT: ldr s19, [sp, #8]
+; CHECK-GI-NEXT: mov v3.s[0], w4
; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s21, [sp, #40]
+; CHECK-GI-NEXT: ldr s20, [sp, #40]
; CHECK-GI-NEXT: ldr s16, [sp, #64]
-; CHECK-GI-NEXT: ldr s22, [sp, #72]
+; CHECK-GI-NEXT: ldr s21, [sp, #72]
+; CHECK-GI-NEXT: mov v0.s[1], v19.s[0]
; CHECK-GI-NEXT: ldr s17, [sp, #96]
-; CHECK-GI-NEXT: ldr s23, [sp, #104]
-; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: mov v4.s[1], w5
+; CHECK-GI-NEXT: ldr s22, [sp, #104]
+; CHECK-GI-NEXT: mov v2.s[1], v20.s[0]
; CHECK-GI-NEXT: ldr s18, [sp, #128]
-; CHECK-GI-NEXT: ldr s24, [sp, #136]
-; CHECK-GI-NEXT: mov v0.s[1], v20.s[0]
+; CHECK-GI-NEXT: ldr s23, [sp, #136]
+; CHECK-GI-NEXT: mov v1.s[1], w1
; CHECK-GI-NEXT: ldr s19, [sp, #160]
-; CHECK-GI-NEXT: ldr s25, [sp, #168]
-; CHECK-GI-NEXT: mov v2.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v17.s[1], v23.s[0]
-; CHECK-GI-NEXT: mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT: mov v19.s[1], v25.s[0]
+; CHECK-GI-NEXT: ldr s24, [sp, #168]
+; CHECK-GI-NEXT: mov v3.s[1], w5
; CHECK-GI-NEXT: ldr s6, [sp, #16]
+; CHECK-GI-NEXT: mov v16.s[1], v21.s[0]
+; CHECK-GI-NEXT: mov v17.s[1], v22.s[0]
+; CHECK-GI-NEXT: mov v18.s[1], v23.s[0]
+; CHECK-GI-NEXT: mov v19.s[1], v24.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s20, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #112]
-; CHECK-GI-NEXT: ldr s22, [sp, #144]
-; CHECK-GI-NEXT: ldr s23, [sp, #176]
-; CHECK-GI-NEXT: mov v1.s[2], w2
-; CHECK-GI-NEXT: mov v4.s[2], w6
; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT: ldr s6, [sp, #144]
+; CHECK-GI-NEXT: ldr s22, [sp, #176]
+; CHECK-GI-NEXT: mov v1.s[2], w2
+; CHECK-GI-NEXT: mov v3.s[2], w6
; CHECK-GI-NEXT: mov v2.s[2], v7.s[0]
; CHECK-GI-NEXT: mov v16.s[2], v20.s[0]
; CHECK-GI-NEXT: mov v17.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v18.s[2], v22.s[0]
-; CHECK-GI-NEXT: mov v19.s[2], v23.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: mov v18.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v19.s[2], v22.s[0]
+; CHECK-GI-NEXT: ldr s4, [sp, #24]
; CHECK-GI-NEXT: ldr s5, [sp, #56]
; CHECK-GI-NEXT: ldr s6, [sp, #88]
; CHECK-GI-NEXT: ldr s7, [sp, #120]
; CHECK-GI-NEXT: ldr s20, [sp, #152]
; CHECK-GI-NEXT: ldr s21, [sp, #184]
; CHECK-GI-NEXT: mov v1.s[3], w3
-; CHECK-GI-NEXT: mov v4.s[3], w7
+; CHECK-GI-NEXT: mov v3.s[3], w7
; CHECK-GI-NEXT: movi v22.4s, #15, msl #8
-; CHECK-GI-NEXT: mov v0.s[3], v3.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], v4.s[0]
; CHECK-GI-NEXT: mov v2.s[3], v5.s[0]
; CHECK-GI-NEXT: mov v16.s[3], v6.s[0]
; CHECK-GI-NEXT: mov v17.s[3], v7.s[0]
; CHECK-GI-NEXT: mov v18.s[3], v20.s[0]
; CHECK-GI-NEXT: mov v19.s[3], v21.s[0]
; CHECK-GI-NEXT: and v1.16b, v1.16b, v22.16b
-; CHECK-GI-NEXT: and v3.16b, v4.16b, v22.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v22.16b
; CHECK-GI-NEXT: and v4.16b, v0.16b, v22.16b
; CHECK-GI-NEXT: and v5.16b, v2.16b, v22.16b
; CHECK-GI-NEXT: and v0.16b, v16.16b, v22.16b
diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
index 3dbc033dfab964..f83ac8ed642cc1 100644
--- a/llvm/test/CodeGen/AArch64/neon-extmul.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -272,18 +272,18 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-GI-NEXT: mul x15, x15, x16
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: fmov x11, d0
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: mul x13, x13, x18
-; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: mul x11, x11, x14
; CHECK-GI-NEXT: mov x14, v6.d[1]
+; CHECK-GI-NEXT: mov v0.d[1], x12
+; CHECK-GI-NEXT: mov v2.d[0], x10
; CHECK-GI-NEXT: mov v1.d[1], x15
-; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: mul x14, x14, x17
-; CHECK-GI-NEXT: fmov d3, x11
-; CHECK-GI-NEXT: mov v3.d[1], x13
+; CHECK-GI-NEXT: mov v3.d[0], x11
; CHECK-GI-NEXT: mov v2.d[1], x14
+; CHECK-GI-NEXT: mov v3.d[1], x13
; CHECK-GI-NEXT: ret
entry:
%s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -423,22 +423,22 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
; CHECK-GI-NEXT: mul x15, x15, x16
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: fmov x11, d0
-; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: mul x13, x13, x18
-; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: mul x11, x11, x14
; CHECK-GI-NEXT: mov x14, v18.d[1]
+; CHECK-GI-NEXT: mov v0.d[1], x12
+; CHECK-GI-NEXT: mov v6.d[0], x10
; CHECK-GI-NEXT: mov v1.d[1], x15
-; CHECK-GI-NEXT: fmov d6, x10
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT: mul x14, x14, x17
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: mov v7.d[0], x11
; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: fmov d7, x11
-; CHECK-GI-NEXT: mov v7.d[1], x13
; CHECK-GI-NEXT: mov v6.d[1], x14
-; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d
+; CHECK-GI-NEXT: mov v7.d[1], x13
; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d
+; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d
; CHECK-GI-NEXT: ret
entry:
%s0s = sext <8 x i8> %s0 to <8 x i64>
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 15763543113eb0..2897741780f602 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -1741,12 +1741,13 @@ define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index af283f6a093e97..3263a5e03c1fdc 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -77,17 +77,18 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) {
;
; CHECK-GI-LABEL: vector_gep_v3i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: smov x8, v3.s[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: smov x9, v3.s[1]
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fmov d1, x8
-; CHECK-GI-NEXT: mov w8, v3.s[2]
-; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: smov x9, v3.s[0]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: smov x10, v3.s[1]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v4.d[0], x9
; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mov w8, v3.s[2]
+; CHECK-GI-NEXT: mov v4.d[1], x10
; CHECK-GI-NEXT: add x8, x9, w8, sxtw
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-GI-NEXT: str x8, [x0, #16]
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
@@ -166,17 +167,18 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) {
;
; CHECK-GI-LABEL: vector_gep_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT: fmov x8, d2
; CHECK-GI-NEXT: fmov x9, d5
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: fmov x8, d2
; CHECK-GI-NEXT: add x8, x8, x9
-; CHECK-GI-NEXT: str x8, [x0, #16]
; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: str x8, [x0, #16]
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
entry:
@@ -206,13 +208,21 @@ entry:
}
define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) {
-; CHECK-LABEL: vector_gep_v4i128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: mov v1.d[1], x2
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: str q0, [x4]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: vector_gep_v4i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d1, x0
+; CHECK-SD-NEXT: mov v1.d[1], x2
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x4]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: vector_gep_v4i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.d[0], x0
+; CHECK-GI-NEXT: mov v1.d[1], x2
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: str q0, [x4]
+; CHECK-GI-NEXT: ret
entry:
%g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off
store <2 x ptr> %g, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index fa0447c2c5d798..adac75758220e2 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -165,18 +165,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v1.b[1]
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov b5, v0.b[3]
-; CHECK-GI-NEXT: mov b6, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT: sqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[2]
+; CHECK-GI-NEXT: mov b1, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT: sqadd v0.8b, v3.8b, v5.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: str w8, [x2]
; CHECK-GI-NEXT: ret
@@ -249,12 +251,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-LABEL: v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: ldr h3, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x1, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: str h0, [x2]
; CHECK-GI-NEXT: str h1, [x2, #2]
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 5237a3491de9b4..277e7e9491015b 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,7 +219,7 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v0.s[2], w2
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
@@ -245,8 +245,8 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: sext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #24 // =0x18
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -280,7 +280,7 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sxtb x8, w2
; CHECK-GI-NEXT: fmov d2, x8
@@ -307,7 +307,7 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: smov w8, v0.h[0]
; CHECK-GI-NEXT: smov w9, v0.h[1]
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: smov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
@@ -382,7 +382,7 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v0.s[2], w2
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
@@ -408,8 +408,8 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: sext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #22 // =0x16
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -443,7 +443,7 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
; CHECK-GI-NEXT: fmov d2, x8
@@ -1024,34 +1024,34 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s3, [sp, #40]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w4
+; CHECK-GI-NEXT: ldr s2, [sp]
+; CHECK-GI-NEXT: ldr s3, [sp, #8]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #16]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w5
+; CHECK-GI-NEXT: ldr s5, [sp, #48]
; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #24]
-; CHECK-GI-NEXT: ldr s3, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w6
+; CHECK-GI-NEXT: ldr s5, [sp, #56]
; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: uzp1 v1.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w7
+; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6
; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6
-; CHECK-GI-NEXT: shl v2.8h, v0.8h, #6
-; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #6
-; CHECK-GI-NEXT: sshr v1.8h, v2.8h, #6
+; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6
+; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #6
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i16>
@@ -1101,36 +1101,36 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s3, [sp, #40]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w4
+; CHECK-GI-NEXT: ldr s2, [sp]
+; CHECK-GI-NEXT: ldr s3, [sp, #8]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #16]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w5
+; CHECK-GI-NEXT: ldr s5, [sp, #48]
; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: ldr s1, [sp, #24]
-; CHECK-GI-NEXT: ldr s3, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w6
+; CHECK-GI-NEXT: ldr s5, [sp, #56]
; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: shl v1.4s, v4.4s, #22
-; CHECK-GI-NEXT: shl v3.4s, v5.4s, #22
-; CHECK-GI-NEXT: shl v4.4s, v0.4s, #22
-; CHECK-GI-NEXT: shl v5.4s, v2.4s, #22
-; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22
-; CHECK-GI-NEXT: sshr v1.4s, v3.4s, #22
-; CHECK-GI-NEXT: sshr v2.4s, v4.4s, #22
-; CHECK-GI-NEXT: sshr v3.4s, v5.4s, #22
+; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w7
+; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22
+; CHECK-GI-NEXT: shl v3.4s, v4.4s, #22
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22
+; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22
+; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22
+; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #22
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i32>
@@ -1188,50 +1188,50 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s7, w0
-; CHECK-GI-NEXT: fmov s17, w2
+; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: mov v2.s[0], w2
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: fmov s18, w4
-; CHECK-GI-NEXT: fmov s19, w6
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
-; CHECK-GI-NEXT: ldr s6, [sp, #48]
-; CHECK-GI-NEXT: ldr s16, [sp, #56]
-; CHECK-GI-NEXT: mov v7.s[1], w1
-; CHECK-GI-NEXT: mov v17.s[1], w3
-; CHECK-GI-NEXT: mov v18.s[1], w5
-; CHECK-GI-NEXT: mov v19.s[1], w7
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: mov v6.s[1], v16.s[0]
-; CHECK-GI-NEXT: ushll v1.2d, v7.2s, #0
-; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
+; CHECK-GI-NEXT: mov v3.s[0], w4
+; CHECK-GI-NEXT: mov v4.s[0], w6
+; CHECK-GI-NEXT: ldr s5, [sp, #8]
+; CHECK-GI-NEXT: ldr s6, [sp, #16]
+; CHECK-GI-NEXT: ldr s7, [sp, #24]
+; CHECK-GI-NEXT: ldr s16, [sp, #32]
+; CHECK-GI-NEXT: ldr s17, [sp, #40]
+; CHECK-GI-NEXT: ldr s18, [sp, #48]
+; CHECK-GI-NEXT: ldr s19, [sp, #56]
+; CHECK-GI-NEXT: mov v1.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], w3
+; CHECK-GI-NEXT: mov v3.s[1], w5
+; CHECK-GI-NEXT: mov v4.s[1], w7
+; CHECK-GI-NEXT: mov v6.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v16.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v18.s[1], v19.s[0]
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll v3.2d, v3.2s, #0
; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0
+; CHECK-GI-NEXT: ushll v5.2d, v6.2s, #0
+; CHECK-GI-NEXT: ushll v6.2d, v16.2s, #0
+; CHECK-GI-NEXT: ushll v7.2d, v18.2s, #0
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #54
+; CHECK-GI-NEXT: shl v2.2d, v2.2d, #54
; CHECK-GI-NEXT: shl v3.2d, v3.2d, #54
+; CHECK-GI-NEXT: shl v16.2d, v4.2d, #54
; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54
+; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54
; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54
-; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54
-; CHECK-GI-NEXT: shl v17.2d, v2.2d, #54
-; CHECK-GI-NEXT: shl v18.2d, v4.2d, #54
-; CHECK-GI-NEXT: shl v19.2d, v6.2d, #54
+; CHECK-GI-NEXT: sshr v4.2d, v0.2d, #54
; CHECK-GI-NEXT: sshr v0.2d, v1.2d, #54
-; CHECK-GI-NEXT: sshr v1.2d, v3.2d, #54
-; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54
-; CHECK-GI-NEXT: sshr v3.2d, v7.2d, #54
-; CHECK-GI-NEXT: sshr v4.2d, v16.2d, #54
-; CHECK-GI-NEXT: sshr v5.2d, v17.2d, #54
-; CHECK-GI-NEXT: sshr v6.2d, v18.2d, #54
-; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54
+; CHECK-GI-NEXT: sshr v1.2d, v2.2d, #54
+; CHECK-GI-NEXT: sshr v2.2d, v3.2d, #54
+; CHECK-GI-NEXT: sshr v3.2d, v16.2d, #54
+; CHECK-GI-NEXT: sshr v5.2d, v5.2d, #54
+; CHECK-GI-NEXT: sshr v6.2d, v6.2d, #54
+; CHECK-GI-NEXT: sshr v7.2d, v7.2d, #54
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 9c8d3e0f07de87..951458da17c07e 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -537,22 +537,29 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){
; CHECK-GI-NEXT: mov h3, v1.h[1]
; CHECK-GI-NEXT: mov h4, v0.h[2]
; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: mov h6, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov h3, v1.h[3]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: mov v1.b[2], w9
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: mov v1.b[3], w9
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = shl <4 x i8> %0, %1
@@ -587,10 +594,10 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
@@ -628,7 +635,7 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsl w8, w8, w9
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = shl <1 x i32> %0, %1
@@ -684,24 +691,31 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h3, v0.h[1]
; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: mov h5, v1.h[3]
-; CHECK-GI-NEXT: mov h6, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov h2, v1.h[3]
+; CHECK-GI-NEXT: fmov w9, s4
+; CHECK-GI-NEXT: mov h4, v0.h[3]
+; CHECK-GI-NEXT: mov v1.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov h3, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v1.b[2], w9
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[3], w8
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = ashr <4 x i8> %0, %1
@@ -734,11 +748,11 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){
; CHECK-GI-LABEL: ashr_v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s3, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT: mov w9, v0.s[1]
+; CHECK-GI-NEXT: mov v1.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[1], w9
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
@@ -774,7 +788,7 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: asr w8, w8, w9
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = ashr <1 x i32> %0, %1
@@ -821,24 +835,31 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h3, v0.h[1]
; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: mov h5, v1.h[3]
-; CHECK-GI-NEXT: mov h6, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov h2, v1.h[3]
+; CHECK-GI-NEXT: fmov w9, s4
+; CHECK-GI-NEXT: mov h4, v0.h[3]
+; CHECK-GI-NEXT: mov v1.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov h3, v0.h[2]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s3
+; CHECK-GI-NEXT: mov v1.b[2], w9
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: mov v0.b[3], w8
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = lshr <4 x i8> %0, %1
@@ -870,11 +891,11 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){
; CHECK-GI-LABEL: lshr_v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s3, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT: mov w9, v0.s[1]
+; CHECK-GI-NEXT: mov v1.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[1], w9
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
@@ -910,7 +931,7 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsr w8, w8, w9
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%3 = lshr <1 x i32> %0, %1
@@ -962,16 +983,12 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-LABEL: shl_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
-; CHECK-GI-NEXT: fmov s2, w3
-; CHECK-GI-NEXT: fmov s3, w4
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT: fmov s3, w5
-; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: fmov s1, w3
+; CHECK-GI-NEXT: mov v0.b[1], w1
+; CHECK-GI-NEXT: mov v1.b[1], w4
+; CHECK-GI-NEXT: mov v0.b[2], w2
+; CHECK-GI-NEXT: mov v1.b[2], w5
+; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: umov w0, v0.b[0]
; CHECK-GI-NEXT: umov w1, v0.b[1]
; CHECK-GI-NEXT: umov w2, v0.b[2]
@@ -1038,15 +1055,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-LABEL: ashr_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w3
-; CHECK-GI-NEXT: fmov s1, w4
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT: fmov s2, w5
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: fmov s2, w2
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], w4
+; CHECK-GI-NEXT: mov v1.b[1], w1
+; CHECK-GI-NEXT: mov v0.b[2], w5
+; CHECK-GI-NEXT: mov v1.b[2], w2
; CHECK-GI-NEXT: neg v0.8b, v0.8b
; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: umov w0, v0.b[0]
@@ -1118,15 +1131,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
; CHECK-GI-LABEL: lshr_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w3
-; CHECK-GI-NEXT: fmov s1, w4
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT: fmov s2, w5
-; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT: fmov s2, w2
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], w4
+; CHECK-GI-NEXT: mov v1.b[1], w1
+; CHECK-GI-NEXT: mov v0.b[2], w5
+; CHECK-GI-NEXT: mov v1.b[2], w2
; CHECK-GI-NEXT: neg v0.8b, v0.8b
; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: umov w0, v0.b[0]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b1131f287fe9a9..954458e4459749 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -213,17 +213,23 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov h3, v1.h[1]
-; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: mov h4, v0.h[2]
; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: mov h6, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov h3, v1.h[3]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: mov v1.b[2], w9
+; CHECK-GI-NEXT: fmov w8, s5
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -280,11 +286,11 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: adrp x8, .LCPI17_0
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -397,8 +403,17 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: dup v0.8b, w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[2]
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov h1, v0.h[3]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.b[2], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -433,8 +448,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: dup v0.4h, w8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
@@ -493,18 +510,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
; CHECK-GI-LABEL: shufflevector_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s1, w3
; CHECK-GI-NEXT: adrp x8, .LCPI30_0
-; CHECK-GI-NEXT: fmov s2, w3
-; CHECK-GI-NEXT: fmov s3, w4
-; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT: fmov s1, w2
-; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT: fmov s3, w5
-; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], w1
+; CHECK-GI-NEXT: mov v1.b[1], w4
+; CHECK-GI-NEXT: mov v0.b[2], w2
+; CHECK-GI-NEXT: mov v1.b[2], w5
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov v0.d[1], v2.d[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
@@ -614,7 +627,10 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.8b, w0
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.b[1], w1
+; CHECK-GI-NEXT: mov v0.b[2], w2
+; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
; CHECK-GI-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index d8b2762cf15e90..12371ef2c0021b 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -166,18 +166,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v1.b[1]
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov b5, v0.b[3]
-; CHECK-GI-NEXT: mov b6, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT: sqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[2]
+; CHECK-GI-NEXT: mov b1, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT: sqsub v0.8b, v3.8b, v5.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: str w8, [x2]
; CHECK-GI-NEXT: ret
@@ -250,12 +252,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-LABEL: v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: ldr h3, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x1, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: str h0, [x2]
; CHECK-GI-NEXT: str h1, [x2, #2]
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index afc0d8704ebace..e99935e8677fc7 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -162,18 +162,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v1.b[1]
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov b5, v0.b[3]
-; CHECK-GI-NEXT: mov b6, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT: uqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[2]
+; CHECK-GI-NEXT: mov b1, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT: uqadd v0.8b, v3.8b, v5.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: str w8, [x2]
; CHECK-GI-NEXT: ret
@@ -248,12 +250,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-LABEL: v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: ldr h3, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x1, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: str h0, [x2]
; CHECK-GI-NEXT: str h1, [x2, #2]
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index dfcbe96ea948a8..cdba9625431a58 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -163,18 +163,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: mov b2, v0.b[1]
-; CHECK-GI-NEXT: mov b3, v1.b[1]
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov b5, v0.b[3]
-; CHECK-GI-NEXT: mov b6, v1.b[3]
-; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT: mov b2, v1.b[2]
-; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT: uqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[1]
+; CHECK-GI-NEXT: mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT: mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT: mov b4, v1.b[2]
+; CHECK-GI-NEXT: mov b1, v1.b[3]
+; CHECK-GI-NEXT: mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT: uqsub v0.8b, v3.8b, v5.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: str w8, [x2]
; CHECK-GI-NEXT: ret
@@ -245,12 +247,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-GI-LABEL: v2i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x0, #2]
-; CHECK-GI-NEXT: ldr h2, [x1]
-; CHECK-GI-NEXT: ldr h3, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x1, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: str h0, [x2]
; CHECK-GI-NEXT: str h1, [x2, #2]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c81fd26a775256..294f26dc0385f8 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -3812,49 +3812,49 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w4
+; CHECK-GI-NEXT: ldr s2, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-NEXT: ldr s3, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-NEXT: ldr s4, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-NEXT: ldr s5, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w5
+; CHECK-GI-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w6
+; CHECK-GI-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w7
+; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[3], v17.s[0]
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: uzp1 v2.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-NEXT: uaddlv h0, v0.16b
-; CHECK-GI-NEXT: uaddlv h1, v1.8b
+; CHECK-GI-NEXT: uaddlv h1, v2.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
@@ -3938,49 +3938,49 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w4
+; CHECK-GI-NEXT: ldr s2, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-NEXT: ldr s3, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-NEXT: ldr s4, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-NEXT: ldr s5, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w5
+; CHECK-GI-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w6
+; CHECK-GI-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w7
+; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[3], v17.s[0]
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: uzp1 v2.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-NEXT: saddlv h0, v0.16b
-; CHECK-GI-NEXT: saddlv h1, v1.8b
+; CHECK-GI-NEXT: saddlv h1, v2.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
@@ -4125,49 +4125,49 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: fmov s4, w0
-; CHECK-GI-BASE-NEXT: fmov s5, w4
-; CHECK-GI-BASE-NEXT: ldr s0, [sp]
+; CHECK-GI-BASE-NEXT: mov v0.s[0], w0
+; CHECK-GI-BASE-NEXT: mov v1.s[0], w4
+; CHECK-GI-BASE-NEXT: ldr s2, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-BASE-NEXT: ldr s3, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr s4, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-BASE-NEXT: ldr s5, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v0.s[1], w1
+; CHECK-GI-BASE-NEXT: mov v1.s[1], w5
+; CHECK-GI-BASE-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-BASE-NEXT: mov v0.s[2], w2
+; CHECK-GI-BASE-NEXT: mov v1.s[2], w6
+; CHECK-GI-BASE-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT: mov v2.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v0.s[3], w3
+; CHECK-GI-BASE-NEXT: mov v1.s[3], w7
+; CHECK-GI-BASE-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[3], v17.s[0]
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-BASE-NEXT: uzp1 v2.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT: uaddlv h1, v2.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
@@ -4176,55 +4176,55 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s4, w0
-; CHECK-GI-DOT-NEXT: fmov s5, w4
-; CHECK-GI-DOT-NEXT: ldr s0, [sp]
+; CHECK-GI-DOT-NEXT: mov v0.s[0], w0
+; CHECK-GI-DOT-NEXT: mov v2.s[0], w4
+; CHECK-GI-DOT-NEXT: ldr s1, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-DOT-NEXT: ldr s3, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr s4, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-DOT-NEXT: ldr s5, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], w1
+; CHECK-GI-DOT-NEXT: mov v2.s[1], w5
+; CHECK-GI-DOT-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-DOT-NEXT: mov v0.s[2], w2
+; CHECK-GI-DOT-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[2], w6
+; CHECK-GI-DOT-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: mov v1.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v0.s[3], w3
+; CHECK-GI-DOT-NEXT: mov v2.s[3], w7
+; CHECK-GI-DOT-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: movi v3.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT: uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: xtn v3.8b, v3.8h
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT: udot v1.4s, v3.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v1.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
@@ -4398,49 +4398,49 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: fmov s4, w0
-; CHECK-GI-BASE-NEXT: fmov s5, w4
-; CHECK-GI-BASE-NEXT: ldr s0, [sp]
+; CHECK-GI-BASE-NEXT: mov v0.s[0], w0
+; CHECK-GI-BASE-NEXT: mov v1.s[0], w4
+; CHECK-GI-BASE-NEXT: ldr s2, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-BASE-NEXT: ldr s3, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr s4, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-BASE-NEXT: ldr s5, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v0.s[1], w1
+; CHECK-GI-BASE-NEXT: mov v1.s[1], w5
+; CHECK-GI-BASE-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-BASE-NEXT: mov v0.s[2], w2
+; CHECK-GI-BASE-NEXT: mov v1.s[2], w6
+; CHECK-GI-BASE-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT: mov v2.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v0.s[3], w3
+; CHECK-GI-BASE-NEXT: mov v1.s[3], w7
+; CHECK-GI-BASE-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v5.s[3], v17.s[0]
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-BASE-NEXT: uzp1 v2.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT: xtn v2.8b, v2.8h
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT: saddlv h1, v2.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
@@ -4449,55 +4449,55 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s4, w0
-; CHECK-GI-DOT-NEXT: fmov s5, w4
-; CHECK-GI-DOT-NEXT: ldr s0, [sp]
+; CHECK-GI-DOT-NEXT: mov v0.s[0], w0
+; CHECK-GI-DOT-NEXT: mov v2.s[0], w4
+; CHECK-GI-DOT-NEXT: ldr s1, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-DOT-NEXT: ldr s3, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr s4, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-DOT-NEXT: ldr s5, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], w1
+; CHECK-GI-DOT-NEXT: mov v2.s[1], w5
+; CHECK-GI-DOT-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[2], v7.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
+; CHECK-GI-DOT-NEXT: mov v0.s[2], w2
+; CHECK-GI-DOT-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[2], w6
+; CHECK-GI-DOT-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: mov v1.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v0.s[3], w3
+; CHECK-GI-DOT-NEXT: mov v2.s[3], w7
+; CHECK-GI-DOT-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: movi v3.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT: uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: xtn v3.8b, v3.8h
+; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT: sdot v1.4s, v3.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v1.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 3c86f4bf9eb213..ef2f297f204c1c 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -136,7 +136,7 @@ define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: mov v0.d[0], x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -174,7 +174,7 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: mov v0.d[0], x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -203,7 +203,7 @@ define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: mov v0.d[0], x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -213,11 +213,17 @@ entry:
}
define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) {
-; CHECK-LABEL: xtn_v2i128_v2i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: mov v0.d[1], x2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: xtn_v2i128_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d0, x0
+; CHECK-SD-NEXT: mov v0.d[1], x2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: mov v0.d[1], x2
+; CHECK-GI-NEXT: ret
entry:
%arg1 = trunc <2 x i128> %a to <2 x i64>
ret <2 x i64> %arg1
@@ -294,10 +300,10 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
;
; CHECK-GI-LABEL: xtn_v3i32_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -321,11 +327,9 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) {
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: fmov x8, d2
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
@@ -346,10 +350,10 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
; CHECK-GI-LABEL: xtn_v3i64_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: fmov x8, d2
-; CHECK-GI-NEXT: mov v0.s[1], w9
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 716d2398996be2..9f3450be607fa5 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -242,16 +242,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: mov w8, #255 // =0xff
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v2.16b, v1.16b
+; CHECK-GI-NEXT: mov v1.h[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], w8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i8> %a to <3 x i16>
@@ -272,8 +271,8 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: zext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #255 // =0xff
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -305,7 +304,7 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: and x8, x2, #0xff
@@ -332,7 +331,7 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: umov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
@@ -407,16 +406,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v2.16b, v1.16b
+; CHECK-GI-NEXT: mov v1.h[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v2.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v1.h[2], w8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i10> %a to <3 x i16>
@@ -437,8 +435,8 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: zext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -469,7 +467,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[0], w0
; CHECK-GI-NEXT: adrp x8, .LCPI27_0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_0]
@@ -1098,33 +1096,33 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s4, w0
-; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w4
; CHECK-GI-NEXT: ldr s2, [sp]
-; CHECK-GI-NEXT: ldr s0, [sp, #8]
-; CHECK-GI-NEXT: ldr s3, [sp, #32]
-; CHECK-GI-NEXT: ldr s1, [sp, #40]
+; CHECK-GI-NEXT: ldr s3, [sp, #8]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
; CHECK-GI-NEXT: movi v6.4s, #3, msl #8
-; CHECK-GI-NEXT: mov v4.s[1], w1
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: mov v2.s[1], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v1.s[0]
-; CHECK-GI-NEXT: ldr s0, [sp, #16]
-; CHECK-GI-NEXT: ldr s1, [sp, #48]
-; CHECK-GI-NEXT: mov v4.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: mov v2.s[2], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v1.s[0]
-; CHECK-GI-NEXT: ldr s0, [sp, #24]
-; CHECK-GI-NEXT: ldr s1, [sp, #56]
-; CHECK-GI-NEXT: mov v4.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[3], w7
-; CHECK-GI-NEXT: mov v2.s[3], v0.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v1.s[0]
-; CHECK-GI-NEXT: and v0.16b, v4.16b, v6.16b
-; CHECK-GI-NEXT: and v1.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #16]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w5
+; CHECK-GI-NEXT: ldr s5, [sp, #48]
+; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
+; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w6
+; CHECK-GI-NEXT: ldr s5, [sp, #56]
+; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
+; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w7
; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
-; CHECK-GI-NEXT: and v3.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: and v3.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v6.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v6.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i32>
@@ -1176,44 +1174,44 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s16, w0
-; CHECK-GI-NEXT: fmov s17, w2
-; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: fmov s18, w4
-; CHECK-GI-NEXT: fmov s19, w6
-; CHECK-GI-NEXT: ldr s1, [sp, #8]
-; CHECK-GI-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
-; CHECK-GI-NEXT: ldr s6, [sp, #48]
-; CHECK-GI-NEXT: ldr s7, [sp, #56]
-; CHECK-GI-NEXT: mov v16.s[1], w1
-; CHECK-GI-NEXT: mov v17.s[1], w3
-; CHECK-GI-NEXT: mov v18.s[1], w5
-; CHECK-GI-NEXT: mov v19.s[1], w7
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w2
+; CHECK-GI-NEXT: ldr s3, [sp]
+; CHECK-GI-NEXT: mov v2.s[0], w4
+; CHECK-GI-NEXT: mov v5.s[0], w6
+; CHECK-GI-NEXT: ldr s4, [sp, #8]
+; CHECK-GI-NEXT: ldr s6, [sp, #16]
+; CHECK-GI-NEXT: ldr s7, [sp, #24]
+; CHECK-GI-NEXT: ldr s16, [sp, #32]
+; CHECK-GI-NEXT: ldr s17, [sp, #40]
+; CHECK-GI-NEXT: ldr s18, [sp, #48]
+; CHECK-GI-NEXT: ldr s19, [sp, #56]
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w3
+; CHECK-GI-NEXT: mov v3.s[1], v4.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], w5
+; CHECK-GI-NEXT: mov v5.s[1], w7
; CHECK-GI-NEXT: mov v6.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v16.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v18.s[1], v19.s[0]
; CHECK-GI-NEXT: adrp x8, .LCPI54_0
-; CHECK-GI-NEXT: ushll v1.2d, v16.2s, #0
-; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
-; CHECK-GI-NEXT: ushll v16.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll v18.2d, v2.2s, #0
-; CHECK-GI-NEXT: ushll v19.2d, v4.2s, #0
-; CHECK-GI-NEXT: ushll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT: and v0.16b, v1.16b, v17.16b
-; CHECK-GI-NEXT: and v1.16b, v3.16b, v17.16b
-; CHECK-GI-NEXT: and v2.16b, v5.16b, v17.16b
-; CHECK-GI-NEXT: and v3.16b, v7.16b, v17.16b
-; CHECK-GI-NEXT: and v4.16b, v16.16b, v17.16b
-; CHECK-GI-NEXT: and v5.16b, v18.16b, v17.16b
-; CHECK-GI-NEXT: and v6.16b, v19.16b, v17.16b
-; CHECK-GI-NEXT: and v7.16b, v20.16b, v17.16b
+; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT: ushll v5.2d, v3.2s, #0
+; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0
+; CHECK-GI-NEXT: ushll v16.2d, v16.2s, #0
+; CHECK-GI-NEXT: ushll v17.2d, v18.2s, #0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v7.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v7.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v7.16b
+; CHECK-GI-NEXT: and v3.16b, v4.16b, v7.16b
+; CHECK-GI-NEXT: and v4.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: and v5.16b, v6.16b, v7.16b
+; CHECK-GI-NEXT: and v6.16b, v16.16b, v7.16b
+; CHECK-GI-NEXT: and v7.16b, v17.16b, v7.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i64>
>From e870fb76f7ee511de037ff060359f64edd938a41 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Thu, 22 Aug 2024 16:21:05 +0000
Subject: [PATCH 3/4] [AArch64][GlobalISel] TableGen Patterns for Lane 0 Vector
Insert
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 27 ++
...legalizer-lowering-build-vector-to-dup.mir | 11 +-
llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 3 +-
llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 3 +-
llvm/test/CodeGen/AArch64/aarch64-smull.ll | 100 ++---
llvm/test/CodeGen/AArch64/abs.ll | 3 +-
llvm/test/CodeGen/AArch64/arm64-dup.ll | 57 +--
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 121 +++---
.../CodeGen/AArch64/arm64-subvector-extend.ll | 198 +++++-----
llvm/test/CodeGen/AArch64/bitcast.ll | 44 +--
llvm/test/CodeGen/AArch64/bswap.ll | 3 +-
llvm/test/CodeGen/AArch64/concat-vector.ll | 91 ++---
llvm/test/CodeGen/AArch64/fcmp.ll | 206 +++++-----
llvm/test/CodeGen/AArch64/fcopysign.ll | 4 +-
.../AArch64/fixed-vector-interleave.ll | 14 +-
llvm/test/CodeGen/AArch64/fptoi.ll | 16 +-
llvm/test/CodeGen/AArch64/icmp.ll | 16 +-
llvm/test/CodeGen/AArch64/insertextract.ll | 34 +-
llvm/test/CodeGen/AArch64/itofp.ll | 36 +-
llvm/test/CodeGen/AArch64/neon-extadd.ll | 142 +++----
llvm/test/CodeGen/AArch64/neon-extmul.ll | 28 +-
llvm/test/CodeGen/AArch64/ptradd.ll | 38 +-
llvm/test/CodeGen/AArch64/sext.ll | 190 ++++-----
llvm/test/CodeGen/AArch64/shift.ll | 9 +-
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 368 +++++++++---------
llvm/test/CodeGen/AArch64/xtn.ll | 28 +-
llvm/test/CodeGen/AArch64/zext.ll | 138 +++----
27 files changed, 924 insertions(+), 1004 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b849e4c50e4fce..3ca92c5ffa6bac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3518,6 +3518,33 @@ def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+def : Pat<(v2i32 (vector_insert (v2i32 undef), (i32 GPR32:$Rn), (i64 0))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v4i32 (vector_insert (v4i32 undef), (i32 GPR32:$Rn), (i64 0))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v1i64 (vector_insert (v1i64 undef), (i64 GPR64:$Rn), (i64 0))),
+ (INSERT_SUBREG (v1i64 (IMPLICIT_DEF)), GPR64:$Rn, dsub)>;
+def : Pat<(v2i64 (vector_insert (v2i64 undef), (i64 GPR64:$Rn), (i64 0))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GPR64:$Rn, dsub)>;
+
+def : Pat<(v8i8 (vec_ins_or_scal_vec (i8 (vector_extract (v8i8 V64:$Rm), (i64 0))))),
+ (v8i8 V64:$Rm)>;
+def : Pat<(v4i16 (vec_ins_or_scal_vec (i16 (vector_extract (v4i16 V64:$Rm), (i64 0))))),
+ (v4i16 V64:$Rm)>;
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 (vector_extract (v2i32 V64:$Rm), (i64 0))))),
+ (v2i32 V64:$Rm)>;
+def : Pat<(v1i64 (vec_ins_or_scal_vec (i64 (vector_extract (v1i64 V64:$Rm), (i64 0))))),
+ (v1i64 V64:$Rm)>; // Scalar type is i64: the element type of v1i64.
+
+def : Pat<(v16i8 (vec_ins_or_scal_vec (i8 (vector_extract (v16i8 V128:$Rm), (i64 0))))),
+ (v16i8 V128:$Rm)>;
+def : Pat<(v8i16 (vec_ins_or_scal_vec (i16 (vector_extract (v8i16 V128:$Rm), (i64 0))))),
+ (v8i16 V128:$Rm)>;
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 (vector_extract (v4i32 V128:$Rm), (i64 0))))),
+ (v4i32 V128:$Rm)>;
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 (vector_extract (v2i64 V128:$Rm), (i64 0))))),
+ (v2i64 V128:$Rm)>;
+
// Match all load 64 bits width whose type is compatible with FPR64
let Predicates = [IsLE] in {
// We must use LD1 to perform vector loads in big-endian.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
index 0115531dfb09ae..22d1ccc056eb48 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
@@ -57,15 +57,12 @@ body: |
; SELECT-NEXT: %r:gpr32 = COPY $w0
; SELECT-NEXT: %q:gpr32 = COPY $w1
; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
+ ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub
; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub
- ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r
+ ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[INSERT_SUBREG]], %subreg.dsub
+ ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
- ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
- ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
- ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub
- ; SELECT-NEXT: $d0 = COPY [[COPY1]]
+ ; SELECT-NEXT: $d0 = COPY [[COPY]]
; SELECT-NEXT: RET_ReallyLR implicit $d0
%r:_(s32) = COPY $w0
%q:_(s32) = COPY $w1
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 9734ab35bd6b2d..7f922c00475535 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -76,8 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: bic w9, w9, w8
; CHECK-GI-NEXT: and w8, w8, w10
; CHECK-GI-NEXT: orr w8, w9, w8
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
%neg = xor <1 x i32> %C, <i32 -1>
%and = and <1 x i32> %neg, %B
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 45ad4b07ff66f7..b8eb8269d605c6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -76,8 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: and w9, w8, w9
; CHECK-GI-NEXT: bic w8, w10, w8
; CHECK-GI-NEXT: orr w8, w9, w8
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
%and = and <1 x i32> %C, %B
%neg = xor <1 x i32> %C, <i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index d677526bab0005..c392d8730fb8b8 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -272,20 +272,20 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT: ldr d1, [x1]
-; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov x11, v1.d[1]
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: ldr d0, [x1]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: fmov x11, d0
+; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: mov x9, v0.d[1]
+; CHECK-GI-NEXT: fmov x10, d1
+; CHECK-GI-NEXT: mov x8, v1.d[1]
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
%load.A = load <2 x i16>, ptr %A
%load.B = load <2 x i32>, ptr %B
@@ -320,14 +320,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: mov x11, v1.d[1]
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
%load.A = load <2 x i32>, ptr %A
%and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
@@ -1046,14 +1046,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI36_0
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: mov x10, v0.d[1]
-; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
%tmp3 = sext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
@@ -1161,14 +1161,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI40_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: mov x10, v0.d[1]
-; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
@@ -1262,15 +1262,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-GI-NEXT: adrp x8, .LCPI43_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: mov x10, v0.d[1]
-; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
@@ -1889,15 +1889,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: mov x10, v0.d[1]
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: mov x8, v0.d[1]
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT: fmov x9, d1
-; CHECK-GI-NEXT: mov x11, v1.d[1]
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: mul x9, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: fmov d0, x10
+; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
entry:
%in1 = zext <2 x i32> %src1 to <2 x i64>
@@ -1945,10 +1945,10 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
; CHECK-GI-NEXT: fmov x9, d0
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x9, x9, x12
-; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: mul x11, x13, x14
-; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: mov v0.d[1], x10
+; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: mov v1.d[1], x11
; CHECK-GI-NEXT: ret
entry:
@@ -1990,9 +1990,9 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x12, x9
; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: mul x11, x13, x11
-; CHECK-GI-NEXT: mov v1.d[0], x9
+; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: mov v0.d[1], x10
; CHECK-GI-NEXT: mov v1.d[1], x11
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 6da019a79b7277..ffe2b0a600fe6b 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -247,8 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: cmp w8, #0
; CHECK-GI-NEXT: cneg w8, w9, le
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
entry:
%res = call <1 x i32> @llvm.abs.v1i32(<1 x i32> %a, i1 0)
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index a25763e3b15907..4163541cd73c82 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -334,40 +334,25 @@ entry:
}
define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
-; CHECK-SD-LABEL: f:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: mov.s v0[1], w1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: f:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov.s v0[0], w0
-; CHECK-GI-NEXT: mov.s v0[1], w1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: f:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov.s v0[1], w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
%vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
ret <2 x i32> %vecinit1
}
define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
-; CHECK-SD-LABEL: g:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: mov.s v0[1], w1
-; CHECK-SD-NEXT: mov.s v0[2], w1
-; CHECK-SD-NEXT: mov.s v0[3], w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: g:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov.s v0[0], w0
-; CHECK-GI-NEXT: mov.s v0[1], w1
-; CHECK-GI-NEXT: mov.s v0[2], w1
-; CHECK-GI-NEXT: mov.s v0[3], w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: g:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov.s v0[1], w1
+; CHECK-NEXT: mov.s v0[2], w1
+; CHECK-NEXT: mov.s v0[3], w0
+; CHECK-NEXT: ret
%vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
%vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
@@ -376,17 +361,11 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
}
define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
-; CHECK-SD-LABEL: h:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov d0, x0
-; CHECK-SD-NEXT: mov.d v0[1], x1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: h:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov.d v0[0], x0
-; CHECK-GI-NEXT: mov.d v0[1], x1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov.d v0[1], x1
+; CHECK-NEXT: ret
%vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
%vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
ret <2 x i64> %vecinit1
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index c0d91c1e0c836b..a9d16119a48976 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1252,44 +1252,28 @@ define <8 x i16> @scalar_to_vector.v8i16(i16 %a) {
}
define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v2i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: scalar_to_vector.v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
%b = insertelement <2 x i32> undef, i32 %a, i32 0
ret <2 x i32> %b
}
define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v4i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v4i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: scalar_to_vector.v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
%b = insertelement <4 x i32> undef, i32 %a, i32 0
ret <4 x i32> %b
}
define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i64:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmov d0, x0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v2i64:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov v0.d[0], x0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: scalar_to_vector.v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ret
%b = insertelement <2 x i64> undef, i64 %a, i32 0
ret <2 x i64> %b
}
@@ -1853,38 +1837,36 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov b3, v0.b[1]
-; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov b4, v0.b[2]
-; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov v2.b[2], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov v2.b[3], v3.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[5]
-; CHECK-GI-NEXT: mov v2.b[4], v4.b[0]
-; CHECK-GI-NEXT: mov b4, v0.b[6]
-; CHECK-GI-NEXT: mov b0, v0.b[7]
-; CHECK-GI-NEXT: mov v2.b[5], v3.b[0]
+; CHECK-GI-NEXT: mov b4, v0.b[3]
+; CHECK-GI-NEXT: mov b5, v0.b[4]
+; CHECK-GI-NEXT: mov b6, v0.b[5]
+; CHECK-GI-NEXT: mov b7, v0.b[6]
+; CHECK-GI-NEXT: mov b16, v0.b[7]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[1]
+; CHECK-GI-NEXT: mov v0.b[2], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[2]
-; CHECK-GI-NEXT: mov v2.b[6], v4.b[0]
-; CHECK-GI-NEXT: mov v2.b[7], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[1]
-; CHECK-GI-NEXT: mov v2.b[8], v1.b[0]
-; CHECK-GI-NEXT: mov v2.b[9], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[3]
-; CHECK-GI-NEXT: mov v2.b[10], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v4.b[0]
+; CHECK-GI-NEXT: mov v0.b[4], v5.b[0]
+; CHECK-GI-NEXT: mov v0.b[5], v6.b[0]
+; CHECK-GI-NEXT: mov v0.b[6], v7.b[0]
+; CHECK-GI-NEXT: mov v0.b[7], v16.b[0]
+; CHECK-GI-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-GI-NEXT: mov v0.b[9], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[10], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[4]
-; CHECK-GI-NEXT: mov v2.b[11], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[5]
-; CHECK-GI-NEXT: mov v2.b[12], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[11], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[5]
+; CHECK-GI-NEXT: mov v0.b[12], v3.b[0]
; CHECK-GI-NEXT: mov b3, v1.b[6]
-; CHECK-GI-NEXT: mov v2.b[13], v0.b[0]
-; CHECK-GI-NEXT: mov b0, v1.b[7]
-; CHECK-GI-NEXT: mov v2.b[14], v3.b[0]
-; CHECK-GI-NEXT: mov v2.b[15], v0.b[0]
-; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov b1, v1.b[7]
+; CHECK-GI-NEXT: mov v0.b[13], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[14], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[15], v1.b[0]
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <16 x i8> %x, i32 0
@@ -2062,7 +2044,7 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.16b, v0.16b
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
@@ -2189,12 +2171,10 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v0.s[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v2.s[3], v1.s[1]
-; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], v1.s[1]
; CHECK-GI-NEXT: ret
entry:
%vecext = extractelement <4 x i32> %x, i32 0
@@ -2244,18 +2224,11 @@ entry:
}
define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
-; CHECK-SD-LABEL: test_concat_v2i64_v2i64_v1i64:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
entry:
%vecext = extractelement <2 x i64> %x, i32 0
%vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 51ce5360744eb0..9e2a34e8f8a8ed 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -466,62 +466,62 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
;
; CHECK-GI-LABEL: sext_v32i1:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov.s v3[0], w0
-; CHECK-GI-NEXT: mov.s v5[0], w4
+; CHECK-GI-NEXT: fmov s16, w0
+; CHECK-GI-NEXT: fmov s19, w4
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s20, [sp, #8]
+; CHECK-GI-NEXT: ldr s21, [sp, #8]
; CHECK-GI-NEXT: ldr s1, [sp, #32]
-; CHECK-GI-NEXT: ldr s21, [sp, #40]
-; CHECK-GI-NEXT: ldr s6, [sp, #64]
-; CHECK-GI-NEXT: ldr s22, [sp, #72]
-; CHECK-GI-NEXT: ldr s7, [sp, #96]
-; CHECK-GI-NEXT: ldr s23, [sp, #104]
-; CHECK-GI-NEXT: mov.s v0[1], v20[0]
-; CHECK-GI-NEXT: mov.s v1[1], v21[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #128]
-; CHECK-GI-NEXT: ldr s24, [sp, #136]
-; CHECK-GI-NEXT: mov.s v3[1], w1
-; CHECK-GI-NEXT: ldr s17, [sp, #160]
+; CHECK-GI-NEXT: ldr s22, [sp, #40]
+; CHECK-GI-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-NEXT: ldr s23, [sp, #72]
+; CHECK-GI-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-NEXT: ldr s24, [sp, #104]
+; CHECK-GI-NEXT: mov.s v16[1], w1
+; CHECK-GI-NEXT: mov.s v19[1], w5
+; CHECK-GI-NEXT: ldr s5, [sp, #128]
+; CHECK-GI-NEXT: ldr s20, [sp, #136]
+; CHECK-GI-NEXT: mov.s v0[1], v21[0]
+; CHECK-GI-NEXT: ldr s7, [sp, #160]
; CHECK-GI-NEXT: ldr s25, [sp, #168]
-; CHECK-GI-NEXT: mov.s v5[1], w5
-; CHECK-GI-NEXT: mov.s v6[1], v22[0]
-; CHECK-GI-NEXT: mov.s v7[1], v23[0]
-; CHECK-GI-NEXT: mov.s v16[1], v24[0]
-; CHECK-GI-NEXT: mov.s v17[1], v25[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #16]
-; CHECK-GI-NEXT: ldr s19, [sp, #48]
+; CHECK-GI-NEXT: mov.s v1[1], v22[0]
+; CHECK-GI-NEXT: mov.s v2[1], v23[0]
+; CHECK-GI-NEXT: mov.s v3[1], v24[0]
+; CHECK-GI-NEXT: mov.s v5[1], v20[0]
+; CHECK-GI-NEXT: mov.s v7[1], v25[0]
+; CHECK-GI-NEXT: ldr s17, [sp, #16]
+; CHECK-GI-NEXT: ldr s18, [sp, #48]
; CHECK-GI-NEXT: ldr s20, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #112]
; CHECK-GI-NEXT: ldr s22, [sp, #144]
; CHECK-GI-NEXT: ldr s23, [sp, #176]
-; CHECK-GI-NEXT: mov.s v3[2], w2
-; CHECK-GI-NEXT: mov.s v5[2], w6
-; CHECK-GI-NEXT: mov.s v0[2], v4[0]
-; CHECK-GI-NEXT: mov.s v1[2], v19[0]
-; CHECK-GI-NEXT: mov.s v6[2], v20[0]
-; CHECK-GI-NEXT: mov.s v7[2], v21[0]
-; CHECK-GI-NEXT: mov.s v16[2], v22[0]
-; CHECK-GI-NEXT: mov.s v17[2], v23[0]
-; CHECK-GI-NEXT: ldr s2, [sp, #24]
-; CHECK-GI-NEXT: ldr s18, [sp, #56]
-; CHECK-GI-NEXT: ldr s4, [sp, #88]
-; CHECK-GI-NEXT: ldr s19, [sp, #120]
+; CHECK-GI-NEXT: mov.s v16[2], w2
+; CHECK-GI-NEXT: mov.s v19[2], w6
+; CHECK-GI-NEXT: mov.s v0[2], v17[0]
+; CHECK-GI-NEXT: mov.s v1[2], v18[0]
+; CHECK-GI-NEXT: mov.s v2[2], v20[0]
+; CHECK-GI-NEXT: mov.s v3[2], v21[0]
+; CHECK-GI-NEXT: mov.s v5[2], v22[0]
+; CHECK-GI-NEXT: mov.s v7[2], v23[0]
+; CHECK-GI-NEXT: ldr s4, [sp, #24]
+; CHECK-GI-NEXT: ldr s6, [sp, #56]
+; CHECK-GI-NEXT: ldr s17, [sp, #88]
+; CHECK-GI-NEXT: ldr s18, [sp, #120]
; CHECK-GI-NEXT: ldr s20, [sp, #152]
; CHECK-GI-NEXT: ldr s21, [sp, #184]
-; CHECK-GI-NEXT: mov.s v3[3], w3
-; CHECK-GI-NEXT: mov.s v5[3], w7
-; CHECK-GI-NEXT: mov.s v0[3], v2[0]
-; CHECK-GI-NEXT: mov.s v1[3], v18[0]
-; CHECK-GI-NEXT: mov.s v6[3], v4[0]
-; CHECK-GI-NEXT: mov.s v7[3], v19[0]
-; CHECK-GI-NEXT: mov.s v16[3], v20[0]
-; CHECK-GI-NEXT: mov.s v17[3], v21[0]
-; CHECK-GI-NEXT: uzp1.8h v2, v3, v5
+; CHECK-GI-NEXT: mov.s v16[3], w3
+; CHECK-GI-NEXT: mov.s v19[3], w7
+; CHECK-GI-NEXT: mov.s v0[3], v4[0]
+; CHECK-GI-NEXT: mov.s v1[3], v6[0]
+; CHECK-GI-NEXT: mov.s v2[3], v17[0]
+; CHECK-GI-NEXT: mov.s v3[3], v18[0]
+; CHECK-GI-NEXT: mov.s v5[3], v20[0]
+; CHECK-GI-NEXT: mov.s v7[3], v21[0]
+; CHECK-GI-NEXT: uzp1.8h v4, v16, v19
; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT: uzp1.8h v1, v6, v7
-; CHECK-GI-NEXT: uzp1.8h v3, v16, v17
-; CHECK-GI-NEXT: uzp1.16b v0, v2, v0
-; CHECK-GI-NEXT: uzp1.16b v1, v1, v3
+; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
+; CHECK-GI-NEXT: uzp1.8h v2, v5, v7
+; CHECK-GI-NEXT: uzp1.16b v0, v4, v0
+; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
; CHECK-GI-NEXT: shl.16b v0, v0, #7
; CHECK-GI-NEXT: shl.16b v1, v1, #7
; CHECK-GI-NEXT: sshr.16b v0, v0, #7
@@ -820,114 +820,114 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
; CHECK-GI-NEXT: ldr s1, [sp, #64]
; CHECK-GI-NEXT: ldr s23, [sp, #72]
; CHECK-GI-NEXT: mov.s v0[1], v4[0]
-; CHECK-GI-NEXT: ldr s4, [sp, #160]
-; CHECK-GI-NEXT: ldr s25, [sp, #168]
+; CHECK-GI-NEXT: ldr s28, [sp, #200]
+; CHECK-GI-NEXT: ldr s3, [sp, #128]
; CHECK-GI-NEXT: mov.s v2[1], v5[0]
; CHECK-GI-NEXT: mov.s v1[1], v23[0]
; CHECK-GI-NEXT: ldr s5, [sp, #192]
-; CHECK-GI-NEXT: ldr s28, [sp, #200]
-; CHECK-GI-NEXT: mov.s v4[1], v25[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #128]
; CHECK-GI-NEXT: ldr s7, [sp, #136]
+; CHECK-GI-NEXT: ldr s4, [sp, #160]
+; CHECK-GI-NEXT: ldr s24, [sp, #168]
+; CHECK-GI-NEXT: mov.s v5[1], v28[0]
; CHECK-GI-NEXT: ldr s6, [sp, #48]
; CHECK-GI-NEXT: ldr s21, [sp, #80]
-; CHECK-GI-NEXT: mov.s v5[1], v28[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #112]
-; CHECK-GI-NEXT: ldr s27, [sp, #176]
; CHECK-GI-NEXT: mov.s v3[1], v7[0]
+; CHECK-GI-NEXT: mov.s v4[1], v24[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #112]
+; CHECK-GI-NEXT: ldr s29, [sp, #208]
; CHECK-GI-NEXT: mov.s v0[2], v6[0]
; CHECK-GI-NEXT: mov.s v1[2], v21[0]
-; CHECK-GI-NEXT: ldr s29, [sp, #208]
-; CHECK-GI-NEXT: ldr s20, [sp, #144]
-; CHECK-GI-NEXT: mov.s v2[2], v16[0]
; CHECK-GI-NEXT: ldr s6, [sp, #224]
; CHECK-GI-NEXT: ldr s30, [sp, #232]
-; CHECK-GI-NEXT: mov.s v4[2], v27[0]
-; CHECK-GI-NEXT: ldr s7, [sp, #256]
-; CHECK-GI-NEXT: ldr s31, [sp, #264]
+; CHECK-GI-NEXT: mov.s v2[2], v16[0]
+; CHECK-GI-NEXT: ldr s20, [sp, #144]
+; CHECK-GI-NEXT: ldr s27, [sp, #176]
; CHECK-GI-NEXT: mov.s v5[2], v29[0]
+; CHECK-GI-NEXT: mov.s v6[1], v30[0]
; CHECK-GI-NEXT: ldr s18, [sp, #88]
; CHECK-GI-NEXT: ldr s19, [sp, #120]
+; CHECK-GI-NEXT: ldr s7, [sp, #256]
+; CHECK-GI-NEXT: ldr s31, [sp, #264]
; CHECK-GI-NEXT: mov.s v3[2], v20[0]
-; CHECK-GI-NEXT: ldr s23, [sp, #184]
-; CHECK-GI-NEXT: ldr s24, [sp, #216]
-; CHECK-GI-NEXT: mov.s v6[1], v30[0]
-; CHECK-GI-NEXT: mov.s v7[1], v31[0]
+; CHECK-GI-NEXT: mov.s v4[2], v27[0]
+; CHECK-GI-NEXT: ldr s25, [sp, #216]
+; CHECK-GI-NEXT: ldr s26, [sp, #240]
; CHECK-GI-NEXT: ldr s17, [sp, #56]
; CHECK-GI-NEXT: ldr s22, [sp, #152]
-; CHECK-GI-NEXT: ldr s26, [sp, #240]
-; CHECK-GI-NEXT: ldr s28, [sp, #272]
; CHECK-GI-NEXT: mov.s v1[3], v18[0]
+; CHECK-GI-NEXT: ldr s23, [sp, #184]
; CHECK-GI-NEXT: mov.s v2[3], v19[0]
; CHECK-GI-NEXT: ldr s18, [sp, #320]
; CHECK-GI-NEXT: ldr s27, [sp, #328]
+; CHECK-GI-NEXT: mov.s v7[1], v31[0]
; CHECK-GI-NEXT: ldr s19, [sp, #352]
; CHECK-GI-NEXT: ldr s29, [sp, #360]
-; CHECK-GI-NEXT: mov.s v4[3], v23[0]
-; CHECK-GI-NEXT: mov.s v23[0], w0
-; CHECK-GI-NEXT: mov.s v5[3], v24[0]
-; CHECK-GI-NEXT: mov.s v24[0], w4
+; CHECK-GI-NEXT: mov.s v5[3], v25[0]
+; CHECK-GI-NEXT: mov.s v6[2], v26[0]
+; CHECK-GI-NEXT: fmov s25, w0
+; CHECK-GI-NEXT: fmov s26, w4
+; CHECK-GI-NEXT: ldr s28, [sp, #272]
; CHECK-GI-NEXT: mov.s v0[3], v17[0]
; CHECK-GI-NEXT: ldr s17, [sp, #288]
; CHECK-GI-NEXT: ldr s8, [sp, #296]
; CHECK-GI-NEXT: mov.s v3[3], v22[0]
; CHECK-GI-NEXT: ldr s20, [sp, #384]
-; CHECK-GI-NEXT: mov.s v6[2], v26[0]
+; CHECK-GI-NEXT: mov.s v4[3], v23[0]
; CHECK-GI-NEXT: ldr s30, [sp, #392]
-; CHECK-GI-NEXT: mov.s v7[2], v28[0]
; CHECK-GI-NEXT: ldr s22, [sp, #416]
-; CHECK-GI-NEXT: ldr s28, [sp, #424]
+; CHECK-GI-NEXT: ldr s31, [sp, #424]
+; CHECK-GI-NEXT: ldr s23, [sp, #448]
; CHECK-GI-NEXT: mov.s v18[1], v27[0]
-; CHECK-GI-NEXT: ldr s26, [sp, #448]
; CHECK-GI-NEXT: mov.s v19[1], v29[0]
; CHECK-GI-NEXT: ldr s27, [sp, #456]
-; CHECK-GI-NEXT: ldr s25, [sp, #336]
+; CHECK-GI-NEXT: ldr s24, [sp, #336]
; CHECK-GI-NEXT: mov.s v17[1], v8[0]
-; CHECK-GI-NEXT: mov.s v23[1], w1
-; CHECK-GI-NEXT: mov.s v24[1], w5
+; CHECK-GI-NEXT: mov.s v7[2], v28[0]
+; CHECK-GI-NEXT: mov.s v25[1], w1
+; CHECK-GI-NEXT: mov.s v26[1], w5
; CHECK-GI-NEXT: mov.s v20[1], v30[0]
-; CHECK-GI-NEXT: ldr s29, [sp, #368]
-; CHECK-GI-NEXT: mov.s v22[1], v28[0]
-; CHECK-GI-NEXT: mov.s v26[1], v27[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #368]
+; CHECK-GI-NEXT: mov.s v22[1], v31[0]
+; CHECK-GI-NEXT: mov.s v23[1], v27[0]
; CHECK-GI-NEXT: ldr s9, [sp, #304]
; CHECK-GI-NEXT: ldr s27, [sp, #400]
-; CHECK-GI-NEXT: mov.s v18[2], v25[0]
-; CHECK-GI-NEXT: ldr s25, [sp, #432]
-; CHECK-GI-NEXT: mov.s v19[2], v29[0]
-; CHECK-GI-NEXT: ldr s29, [sp, #464]
+; CHECK-GI-NEXT: mov.s v18[2], v24[0]
+; CHECK-GI-NEXT: ldr s24, [sp, #432]
+; CHECK-GI-NEXT: mov.s v19[2], v28[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #464]
; CHECK-GI-NEXT: ldr s16, [sp, #248]
; CHECK-GI-NEXT: ldr s21, [sp, #280]
; CHECK-GI-NEXT: mov.s v17[2], v9[0]
-; CHECK-GI-NEXT: mov.s v23[2], w2
-; CHECK-GI-NEXT: mov.s v24[2], w6
+; CHECK-GI-NEXT: mov.s v25[2], w2
+; CHECK-GI-NEXT: mov.s v26[2], w6
; CHECK-GI-NEXT: mov.s v20[2], v27[0]
-; CHECK-GI-NEXT: mov.s v22[2], v25[0]
-; CHECK-GI-NEXT: mov.s v26[2], v29[0]
-; CHECK-GI-NEXT: ldr s28, [sp, #312]
+; CHECK-GI-NEXT: mov.s v22[2], v24[0]
+; CHECK-GI-NEXT: mov.s v23[2], v28[0]
+; CHECK-GI-NEXT: ldr s29, [sp, #312]
; CHECK-GI-NEXT: ldr s27, [sp, #344]
-; CHECK-GI-NEXT: ldr s25, [sp, #376]
-; CHECK-GI-NEXT: ldr s29, [sp, #408]
+; CHECK-GI-NEXT: ldr s24, [sp, #376]
+; CHECK-GI-NEXT: ldr s28, [sp, #408]
; CHECK-GI-NEXT: mov.s v6[3], v16[0]
; CHECK-GI-NEXT: ldr s16, [sp, #440]
; CHECK-GI-NEXT: mov.s v7[3], v21[0]
; CHECK-GI-NEXT: ldr s21, [sp, #472]
-; CHECK-GI-NEXT: mov.s v23[3], w3
-; CHECK-GI-NEXT: mov.s v24[3], w7
-; CHECK-GI-NEXT: mov.s v17[3], v28[0]
+; CHECK-GI-NEXT: mov.s v25[3], w3
+; CHECK-GI-NEXT: mov.s v26[3], w7
+; CHECK-GI-NEXT: mov.s v17[3], v29[0]
; CHECK-GI-NEXT: mov.s v18[3], v27[0]
-; CHECK-GI-NEXT: mov.s v19[3], v25[0]
-; CHECK-GI-NEXT: mov.s v20[3], v29[0]
+; CHECK-GI-NEXT: mov.s v19[3], v24[0]
+; CHECK-GI-NEXT: mov.s v20[3], v28[0]
; CHECK-GI-NEXT: mov.s v22[3], v16[0]
-; CHECK-GI-NEXT: mov.s v26[3], v21[0]
+; CHECK-GI-NEXT: mov.s v23[3], v21[0]
; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
; CHECK-GI-NEXT: uzp1.8h v2, v4, v5
; CHECK-GI-NEXT: uzp1.8h v3, v6, v7
; CHECK-GI-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: uzp1.8h v16, v23, v24
+; CHECK-GI-NEXT: uzp1.8h v16, v25, v26
; CHECK-GI-NEXT: uzp1.8h v4, v17, v18
; CHECK-GI-NEXT: uzp1.8h v5, v19, v20
-; CHECK-GI-NEXT: uzp1.8h v6, v22, v26
+; CHECK-GI-NEXT: uzp1.8h v6, v22, v23
; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
; CHECK-GI-NEXT: uzp1.16b v0, v16, v0
; CHECK-GI-NEXT: uzp1.16b v2, v3, v4
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 79cfeedb74bce0..81f77dc50d636f 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -13,7 +13,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
; CHECK-GI-LABEL: foo1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #58712 // =0xe558
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
@@ -33,7 +33,7 @@ define <4 x i16> @foo2(<2 x i32> %a) {
; CHECK-GI-LABEL: foo2:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #712 // =0x2c8
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
@@ -513,12 +513,10 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){
;
; CHECK-GI-LABEL: bitcast_v8i32_v4i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: mov x8, v2.d[1]
-; CHECK-GI-NEXT: mov x9, v3.d[1]
-; CHECK-GI-NEXT: mov v0.d[0], v2.d[0]
-; CHECK-GI-NEXT: mov v1.d[0], v3.d[0]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: ret
@@ -574,12 +572,10 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){
;
; CHECK-GI-LABEL: bitcast_v16i16_v4i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v2.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT: add v3.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT: mov x8, v2.d[1]
-; CHECK-GI-NEXT: mov x9, v3.d[1]
-; CHECK-GI-NEXT: mov v0.d[0], v2.d[0]
-; CHECK-GI-NEXT: mov v1.d[0], v3.d[0]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: ret
@@ -616,18 +612,14 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
;
; CHECK-GI-LABEL: bitcast_v16i32_v8i64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: add v5.4s, v1.4s, v5.4s
-; CHECK-GI-NEXT: add v6.4s, v2.4s, v6.4s
-; CHECK-GI-NEXT: add v7.4s, v3.4s, v7.4s
-; CHECK-GI-NEXT: mov x8, v4.d[1]
-; CHECK-GI-NEXT: mov x9, v5.d[1]
-; CHECK-GI-NEXT: mov x10, v6.d[1]
-; CHECK-GI-NEXT: mov x11, v7.d[1]
-; CHECK-GI-NEXT: mov v0.d[0], v4.d[0]
-; CHECK-GI-NEXT: mov v1.d[0], v5.d[0]
-; CHECK-GI-NEXT: mov v2.d[0], v6.d[0]
-; CHECK-GI-NEXT: mov v3.d[0], v7.d[0]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: mov x10, v2.d[1]
+; CHECK-GI-NEXT: mov x11, v3.d[1]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v1.d[1], x9
; CHECK-GI-NEXT: mov v2.d[1], x10
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9ee924dd2548a6..df901e70ea3ac1 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -146,8 +146,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
entry:
%res = call <1 x i32> @llvm.bswap.v1i32(<1 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 18570b2d793ff6..932732d18c0ad4 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -33,20 +33,18 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) {
;
; CHECK-GI-LABEL: concat2:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v3.h[0], v1.h[0]
-; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v3.h[1], v1.h[1]
-; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v3.h[2], v1.h[2]
-; CHECK-GI-NEXT: mov v2.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v3.h[3], v1.h[3]
-; CHECK-GI-NEXT: xtn v0.8b, v2.8h
-; CHECK-GI-NEXT: xtn v1.8b, v3.8h
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v2.h[0], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v3.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v3.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v2.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v3.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v2.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v3.h[3], v0.h[3]
+; CHECK-GI-NEXT: xtn v1.8b, v2.8h
+; CHECK-GI-NEXT: xtn v0.8b, v3.8h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -74,16 +72,14 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) {
;
; CHECK-GI-LABEL: concat4:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
-; CHECK-GI-NEXT: xtn v2.4h, v2.4s
-; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
-; CHECK-GI-NEXT: xtn v1.4h, v0.4s
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v2.s[0], v1.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v3.s[1], v0.s[1]
+; CHECK-GI-NEXT: xtn v1.4h, v2.4s
+; CHECK-GI-NEXT: xtn v0.4h, v3.4s
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -146,9 +142,8 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
;
; CHECK-GI-LABEL: concat9:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
@@ -183,14 +178,12 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v8s16_v2s16:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: ldr h1, [x0]
; CHECK-GI-NEXT: ldr h2, [x0, #2]
-; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: xtn v2.4h, v0.4s
-; CHECK-GI-NEXT: xtn v1.4h, v1.4s
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: xtn v0.4h, v1.4s
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
@@ -255,31 +248,29 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <
;
; CHECK-GI-LABEL: concat_v16s8_v4s8_reg:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v4.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v5.h[0], v1.h[0]
+; CHECK-GI-NEXT: mov v4.h[0], v1.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v5.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-GI-NEXT: mov v6.h[0], v2.h[0]
; CHECK-GI-NEXT: mov v7.h[0], v3.h[0]
-; CHECK-GI-NEXT: mov v4.h[1], v0.h[1]
-; CHECK-GI-NEXT: mov v5.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v4.h[1], v1.h[1]
+; CHECK-GI-NEXT: mov v5.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v6.h[1], v2.h[1]
; CHECK-GI-NEXT: mov v7.h[1], v3.h[1]
-; CHECK-GI-NEXT: mov v4.h[2], v0.h[2]
-; CHECK-GI-NEXT: mov v5.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v4.h[2], v1.h[2]
+; CHECK-GI-NEXT: mov v5.h[2], v0.h[2]
; CHECK-GI-NEXT: mov v6.h[2], v2.h[2]
; CHECK-GI-NEXT: mov v7.h[2], v3.h[2]
-; CHECK-GI-NEXT: mov v4.h[3], v0.h[3]
-; CHECK-GI-NEXT: mov v5.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v4.h[3], v1.h[3]
+; CHECK-GI-NEXT: mov v5.h[3], v0.h[3]
; CHECK-GI-NEXT: mov v6.h[3], v2.h[3]
; CHECK-GI-NEXT: mov v7.h[3], v3.h[3]
-; CHECK-GI-NEXT: xtn v0.8b, v4.8h
-; CHECK-GI-NEXT: xtn v1.8b, v5.8h
+; CHECK-GI-NEXT: xtn v1.8b, v4.8h
+; CHECK-GI-NEXT: xtn v0.8b, v5.8h
; CHECK-GI-NEXT: xtn v2.8b, v6.8h
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: xtn v1.8b, v7.8h
; CHECK-GI-NEXT: mov v0.s[1], w8
@@ -308,26 +299,24 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %
;
; CHECK-GI-LABEL: concat_v8s16_v2s16_reg:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v4.s[0], v0.s[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v5.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov v4.s[0], v1.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v5.s[0], v0.s[0]
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: mov v4.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT: mov v4.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v1.s[0], v2.s[0]
-; CHECK-GI-NEXT: xtn v0.4h, v4.4s
-; CHECK-GI-NEXT: xtn v4.4h, v5.4s
+; CHECK-GI-NEXT: mov v5.s[1], v0.s[1]
+; CHECK-GI-NEXT: xtn v4.4h, v4.4s
; CHECK-GI-NEXT: mov v1.s[1], v2.s[1]
; CHECK-GI-NEXT: mov v2.s[0], v3.s[0]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: xtn v0.4h, v5.4s
; CHECK-GI-NEXT: xtn v1.4h, v1.4s
; CHECK-GI-NEXT: mov v2.s[1], v3.s[1]
-; CHECK-GI-NEXT: mov v0.s[0], w8
; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: xtn v2.4h, v2.4s
; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: xtn v2.4h, v2.4s
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: fmov w8, s2
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 8ca1e9ee5b6178..56f94c30eb86ad 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -556,7 +556,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w19, lt
; CHECK-GI-NEXT: bl __lttf2
-; CHECK-GI-NEXT: mov v0.d[0], x19
+; CHECK-GI-NEXT: fmov d0, x19
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w8, lt
; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload
@@ -663,29 +663,29 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: cset w22, lt
; CHECK-GI-NEXT: bl __lttf2
+; CHECK-GI-NEXT: ldp q0, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: sbfx x8, x21, #0, #1
-; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT: sbfx x9, x22, #0, #1
+; CHECK-GI-NEXT: fmov d1, x8
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v1.d[0], x8
-; CHECK-GI-NEXT: sbfx x8, x22, #0, #1
-; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload
-; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.d[1], x8
-; CHECK-GI-NEXT: mov v1.d[1], x8
-; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT: mov v2.d[1], v0.d[0]
+; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: sbfx x8, x8, #0, #1
-; CHECK-GI-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT: bic v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: and x9, x19, x8
; CHECK-GI-NEXT: bic x8, x20, x8
; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-GI-NEXT: orr x8, x9, x8
-; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: bic v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #176
@@ -831,21 +831,21 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
; CHECK-GI-NEXT: fcmp d2, d5
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: cset w9, mi
-; CHECK-GI-NEXT: mov v2.d[0], x9
-; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d
; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v3.s[0], w9
+; CHECK-GI-NEXT: fmov d2, x9
+; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d
; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: mov v3.s[1], w9
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.s[1], w8
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: neg v1.4s, v1.4s
-; CHECK-GI-NEXT: mov v3.s[2], w9
+; CHECK-GI-NEXT: mov v2.s[2], w8
; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -902,18 +902,18 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
; CHECK-GI-LABEL: v3f32_float:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: fmov s4, w8
; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT: neg v4.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -980,18 +980,18 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
; CHECK-GI-LABEL: v3f32_i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: fmov s4, w8
; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT: neg v4.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
@@ -1594,49 +1594,49 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8
-; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0
-; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp]
-; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24]
-; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32]
+; CHECK-GI-NOFP16-NEXT: fmov s4, w8
+; CHECK-GI-NOFP16-NEXT: fmov s7, w0
+; CHECK-GI-NOFP16-NEXT: ldr s5, [sp]
+; CHECK-GI-NOFP16-NEXT: fmov s16, w7
+; CHECK-GI-NOFP16-NEXT: fmov s18, w4
+; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #32]
+; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1
-; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0]
-; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT: mov v7.s[1], w1
+; CHECK-GI-NOFP16-NEXT: mov v16.s[1], v5.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s5, [sp, #24]
+; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5
+; CHECK-GI-NOFP16-NEXT: mov v5.s[1], v17.s[0]
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2
-; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9
-; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0]
-; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40]
+; CHECK-GI-NOFP16-NEXT: mov v7.s[2], w2
+; CHECK-GI-NOFP16-NEXT: mov v16.s[2], v6.s[0]
+; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #40]
+; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v5.s[2], v6.s[0]
; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3
+; CHECK-GI-NOFP16-NEXT: mov v7.s[3], w3
; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4
+; CHECK-GI-NOFP16-NEXT: fmov s3, w8
+; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8
; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8
; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s
; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16]
-; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6
-; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0]
-; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b
-; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT: mov v16.s[3], v4.s[0]
+; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v7.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT: and v1.16b, v5.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b
; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2]
@@ -1655,60 +1655,60 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
; CHECK-GI-FP16-LABEL: v7f16_i32:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f
-; CHECK-GI-FP16-NEXT: mov v4.s[0], w0
-; CHECK-GI-FP16-NEXT: mov v2.s[0], w9
-; CHECK-GI-FP16-NEXT: mov v5.s[0], w7
-; CHECK-GI-FP16-NEXT: ldr s6, [sp]
-; CHECK-GI-FP16-NEXT: mov v7.s[0], w4
+; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT: fmov s5, w0
+; CHECK-GI-FP16-NEXT: fmov s2, w10
+; CHECK-GI-FP16-NEXT: fmov s6, w7
+; CHECK-GI-FP16-NEXT: ldr s3, [sp]
+; CHECK-GI-FP16-NEXT: fmov s17, w4
+; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8]
+; CHECK-GI-FP16-NEXT: mov v5.s[1], w1
; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT: umov w10, v0.h[5]
-; CHECK-GI-FP16-NEXT: mov v4.s[1], w1
-; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
-; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0]
-; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-FP16-NEXT: mov v7.s[1], w5
-; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0]
-; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT: mov v1.s[0], w8
+; CHECK-GI-FP16-NEXT: umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w10
+; CHECK-GI-FP16-NEXT: mov v6.s[1], v3.s[0]
+; CHECK-GI-FP16-NEXT: ldr s3, [sp, #8]
+; CHECK-GI-FP16-NEXT: mov v17.s[1], w5
+; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0]
+; CHECK-GI-FP16-NEXT: mov v5.s[2], w2
+; CHECK-GI-FP16-NEXT: fmov s1, w8
; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w10
; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT: mov v2.s[2], w9
-; CHECK-GI-FP16-NEXT: mov v4.s[2], w2
-; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0]
-; CHECK-GI-FP16-NEXT: mov v7.s[2], w6
+; CHECK-GI-FP16-NEXT: mov v6.s[2], v3.s[0]
+; CHECK-GI-FP16-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-FP16-NEXT: mov v17.s[2], w6
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w9
+; CHECK-GI-FP16-NEXT: mov v7.s[2], v3.s[0]
+; CHECK-GI-FP16-NEXT: mov v5.s[3], w3
; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT: mov v3.s[0], w10
-; CHECK-GI-FP16-NEXT: mov v4.s[3], w3
-; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v1.s[2], w8
-; CHECK-GI-FP16-NEXT: mov v3.s[1], w10
+; CHECK-GI-FP16-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: fmov s4, w8
+; CHECK-GI-FP16-NEXT: mov v4.s[1], w8
; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s
; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s
-; CHECK-GI-FP16-NEXT: mov v3.s[2], w10
; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s
; CHECK-GI-FP16-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-FP16-NEXT: mov v5.s[3], v2.s[0]
-; CHECK-GI-FP16-NEXT: eor v3.16b, v1.16b, v3.16b
-; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT: and v2.16b, v6.16b, v3.16b
-; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v5.16b
-; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b
-; CHECK-GI-FP16-NEXT: mov s2, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov v4.s[2], w8
+; CHECK-GI-FP16-NEXT: mov v6.s[3], v2.s[0]
+; CHECK-GI-FP16-NEXT: eor v3.16b, v1.16b, v4.16b
+; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v5.16b, v6.16b
+; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v3.16b
; CHECK-GI-FP16-NEXT: mov s3, v0.s[2]
; CHECK-GI-FP16-NEXT: mov s4, v0.s[3]
; CHECK-GI-FP16-NEXT: fmov w0, s0
+; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-GI-FP16-NEXT: mov s2, v0.s[1]
; CHECK-GI-FP16-NEXT: mov s5, v1.s[1]
; CHECK-GI-FP16-NEXT: mov s6, v1.s[2]
-; CHECK-GI-FP16-NEXT: fmov w4, s1
-; CHECK-GI-FP16-NEXT: fmov w1, s2
; CHECK-GI-FP16-NEXT: fmov w2, s3
+; CHECK-GI-FP16-NEXT: fmov w1, s2
; CHECK-GI-FP16-NEXT: fmov w3, s4
+; CHECK-GI-FP16-NEXT: fmov w4, s1
; CHECK-GI-FP16-NEXT: fmov w5, s5
; CHECK-GI-FP16-NEXT: fmov w6, s6
; CHECK-GI-FP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index a42ec8e253be29..3a5f7e2cd6b29e 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -156,8 +156,8 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff
-; CHECK-GI-NEXT: mov v2.s[0], w9
-; CHECK-GI-NEXT: mov v3.s[0], w8
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: fmov s3, w8
; CHECK-GI-NEXT: mov v2.s[1], w9
; CHECK-GI-NEXT: mov v3.s[1], w8
; CHECK-GI-NEXT: mov v2.s[2], w9
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index aa20304e52a951..2ea7e0f3c44a9a 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -11,15 +11,13 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
; CHECK-GI-LABEL: interleave2_v4f16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v2.4s, w8
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: xtn v0.4h, v2.4s
-; CHECK-GI-NEXT: mov v1.s[0], w8
-; CHECK-GI-NEXT: mov v2.s[0], w9
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: xtn v2.4h, v2.4s
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: ret
%retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 20b5567e973d09..08c5bd59f93e0d 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -7297,7 +7297,7 @@ define <2 x i64> @fptos_v2f128_v2i64(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x19, x0
; CHECK-GI-NEXT: bl __fixtfdi
-; CHECK-GI-NEXT: mov v0.d[0], x19
+; CHECK-GI-NEXT: fmov d0, x19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7340,7 +7340,7 @@ define <2 x i64> @fptou_v2f128_v2i64(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov x19, x0
; CHECK-GI-NEXT: bl __fixunstfdi
-; CHECK-GI-NEXT: mov v0.d[0], x19
+; CHECK-GI-NEXT: fmov d0, x19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x0
; CHECK-GI-NEXT: add sp, sp, #32
@@ -7496,7 +7496,7 @@ define <2 x i32> @fptos_v2f128_v2i32(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7539,7 +7539,7 @@ define <2 x i32> @fptou_v2f128_v2i32(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7591,7 +7591,7 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7644,7 +7644,7 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w20, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w20
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7890,7 +7890,7 @@ define <2 x i8> @fptos_v2f128_v2i8(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixtfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -7933,7 +7933,7 @@ define <2 x i8> @fptou_v2f128_v2i8(<2 x fp128> %a) {
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov w19, w0
; CHECK-GI-NEXT: bl __fixunstfsi
-; CHECK-GI-NEXT: mov v0.s[0], w19
+; CHECK-GI-NEXT: fmov s0, w19
; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.s[1], w0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index b00e5d6c701d8b..6baf1a84d407c4 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1228,18 +1228,18 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
; CHECK-GI-LABEL: v3i32_i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff
; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT: mov v4.s[0], w8
-; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: fmov s4, w8
; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: mov v5.s[1], w9
; CHECK-GI-NEXT: mov v4.s[2], w8
-; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: neg v1.4s, v4.4s
-; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT: neg v4.4s, v4.4s
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 296e267a9c7f0b..4be7ff5780d9db 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -271,11 +271,9 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3f32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v0.s[1]
; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
-; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x float> %a, float %b, i32 2
@@ -964,21 +962,13 @@ entry:
}
define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
-; CHECK-SD-LABEL: insert_v3i32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-SD-NEXT: mov v1.s[2], v0.s[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: insert_v3i32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v1.s[2], v0.s[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: insert_v3i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 0
ret <3 x i32> %d
@@ -992,10 +982,8 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
-; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-GI-NEXT: mov v1.s[2], w0
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[1], v0.s[1]
+; CHECK-GI-NEXT: mov v0.s[2], w0
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 4ac04798e15481..d9fc3eabd34873 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3313,24 +3313,24 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) {
; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov h2, v1.h[1]
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GI-NEXT: smov x8, v0.s[0]
; CHECK-GI-NEXT: smov x9, v0.s[1]
-; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: smov x8, v1.s[0]
-; CHECK-GI-NEXT: mov v0.d[1], x9
-; CHECK-GI-NEXT: smov x9, v1.s[1]
-; CHECK-GI-NEXT: mov v1.d[0], x8
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: smov x8, v0.s[0]
; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: scvtf v2.2d, v1.2d
+; CHECK-GI-NEXT: smov x9, v0.s[1]
+; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: scvtf v0.2d, v1.2d
+; CHECK-GI-NEXT: mov v2.d[1], x9
; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: scvtf v2.2d, v2.2d
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
@@ -3365,24 +3365,24 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) {
; CHECK-GI-NEXT: mov v0.h[1], w1
; CHECK-GI-NEXT: mov v0.h[2], w2
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov h2, v1.h[1]
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov w8, v1.s[0]
-; CHECK-GI-NEXT: mov v0.d[1], x9
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov v1.d[0], x8
+; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: mov w8, v0.s[0]
; CHECK-GI-NEXT: mov v1.d[1], x9
-; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d
+; CHECK-GI-NEXT: mov w9, v0.s[1]
+; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: ucvtf v0.2d, v1.2d
+; CHECK-GI-NEXT: mov v2.d[1], x9
; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index de2c9d50b80540..fc395597b48316 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1267,87 +1267,87 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
; CHECK-GI-LABEL: v20:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s18, [sp, #8]
-; CHECK-GI-NEXT: mov v1.s[0], w0
-; CHECK-GI-NEXT: mov v4.s[0], w4
-; CHECK-GI-NEXT: ldr s17, [sp, #96]
-; CHECK-GI-NEXT: ldr s25, [sp, #104]
-; CHECK-GI-NEXT: mov v0.s[1], v18.s[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #128]
-; CHECK-GI-NEXT: ldr s26, [sp, #136]
-; CHECK-GI-NEXT: ldr s22, [sp, #16]
+; CHECK-GI-NEXT: ldr s4, [sp, #8]
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: ldr s18, [sp, #16]
+; CHECK-GI-NEXT: fmov s3, w4
+; CHECK-GI-NEXT: ldr s16, [sp, #96]
+; CHECK-GI-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-GI-NEXT: ldr s22, [sp, #104]
+; CHECK-GI-NEXT: ldr s23, [sp, #136]
; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: mov v17.s[1], v25.s[0]
+; CHECK-GI-NEXT: ldr s19, [sp, #40]
; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: ldr s24, [sp, #40]
-; CHECK-GI-NEXT: mov v18.s[1], v26.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], w5
-; CHECK-GI-NEXT: ldr s3, [sp, #64]
+; CHECK-GI-NEXT: mov v3.s[1], w5
+; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT: ldr s4, [sp, #64]
; CHECK-GI-NEXT: ldr s21, [sp, #72]
+; CHECK-GI-NEXT: mov v2.s[1], v19.s[0]
; CHECK-GI-NEXT: ldr s19, [sp, #160]
-; CHECK-GI-NEXT: ldr s27, [sp, #168]
+; CHECK-GI-NEXT: mov v0.s[2], v18.s[0]
+; CHECK-GI-NEXT: ldr s18, [sp, #128]
+; CHECK-GI-NEXT: ldr s24, [sp, #168]
; CHECK-GI-NEXT: ldr s20, [sp, #192]
-; CHECK-GI-NEXT: ldr s28, [sp, #200]
-; CHECK-GI-NEXT: mov v0.s[2], v22.s[0]
+; CHECK-GI-NEXT: ldr s25, [sp, #200]
; CHECK-GI-NEXT: ldr s22, [sp, #224]
-; CHECK-GI-NEXT: ldr s25, [sp, #232]
+; CHECK-GI-NEXT: mov v18.s[1], v23.s[0]
+; CHECK-GI-NEXT: ldr s27, [sp, #232]
; CHECK-GI-NEXT: ldr s23, [sp, #112]
-; CHECK-GI-NEXT: mov v2.s[1], v24.s[0]
-; CHECK-GI-NEXT: ldr s24, [sp, #144]
-; CHECK-GI-NEXT: mov v19.s[1], v27.s[0]
-; CHECK-GI-NEXT: mov v20.s[1], v28.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v22.s[1], v25.s[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #48]
+; CHECK-GI-NEXT: ldr s26, [sp, #144]
+; CHECK-GI-NEXT: mov v19.s[1], v24.s[0]
+; CHECK-GI-NEXT: mov v20.s[1], v25.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v21.s[0]
+; CHECK-GI-NEXT: mov v22.s[1], v27.s[0]
; CHECK-GI-NEXT: mov v1.s[2], w2
-; CHECK-GI-NEXT: mov v4.s[2], w6
-; CHECK-GI-NEXT: mov v17.s[2], v23.s[0]
-; CHECK-GI-NEXT: mov v18.s[2], v24.s[0]
+; CHECK-GI-NEXT: ldr s17, [sp, #48]
+; CHECK-GI-NEXT: mov v3.s[2], w6
+; CHECK-GI-NEXT: mov v16.s[2], v23.s[0]
+; CHECK-GI-NEXT: mov v18.s[2], v26.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #176]
-; CHECK-GI-NEXT: ldr s26, [sp, #208]
-; CHECK-GI-NEXT: ldr s24, [sp, #240]
-; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT: ldr s16, [sp, #120]
+; CHECK-GI-NEXT: ldr s24, [sp, #208]
+; CHECK-GI-NEXT: ldr s25, [sp, #240]
+; CHECK-GI-NEXT: mov v2.s[2], v17.s[0]
+; CHECK-GI-NEXT: ldr s17, [sp, #120]
; CHECK-GI-NEXT: ldr s23, [sp, #152]
; CHECK-GI-NEXT: ldr s5, [sp, #24]
; CHECK-GI-NEXT: mov v19.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v20.s[2], v26.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
-; CHECK-GI-NEXT: mov v22.s[2], v24.s[0]
+; CHECK-GI-NEXT: mov v20.s[2], v24.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v22.s[2], v25.s[0]
; CHECK-GI-NEXT: mov v1.s[3], w3
-; CHECK-GI-NEXT: mov v4.s[3], w7
-; CHECK-GI-NEXT: mov v17.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], w7
+; CHECK-GI-NEXT: mov v16.s[3], v17.s[0]
; CHECK-GI-NEXT: mov v18.s[3], v23.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #56]
; CHECK-GI-NEXT: ldr s7, [sp, #184]
; CHECK-GI-NEXT: ldr s21, [sp, #216]
-; CHECK-GI-NEXT: ldr s16, [sp, #88]
+; CHECK-GI-NEXT: ldr s17, [sp, #88]
; CHECK-GI-NEXT: mov v0.s[3], v5.s[0]
; CHECK-GI-NEXT: ldr s5, [sp, #248]
; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
; CHECK-GI-NEXT: mov v19.s[3], v7.s[0]
; CHECK-GI-NEXT: mov v20.s[3], v21.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v4.s[3], v17.s[0]
; CHECK-GI-NEXT: mov v22.s[3], v5.s[0]
-; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v4.8h
-; CHECK-GI-NEXT: movi v4.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT: uzp1 v5.8h, v17.8h, v18.8h
+; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT: uzp1 v5.8h, v16.8h, v18.8h
; CHECK-GI-NEXT: dup v6.4s, w8
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: uzp1 v2.8h, v19.8h, v20.8h
-; CHECK-GI-NEXT: uzp1 v3.8h, v3.8h, v6.8h
+; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v6.8h
; CHECK-GI-NEXT: uzp1 v6.8h, v22.8h, v6.8h
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v5.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
; CHECK-GI-NEXT: add v1.8h, v1.8h, v5.8h
-; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT: and v3.16b, v6.16b, v3.16b
; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT: add v3.4h, v3.4h, v4.4h
+; CHECK-GI-NEXT: add v3.4h, v4.4h, v3.4h
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
@@ -1459,42 +1459,42 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
;
; CHECK-GI-LABEL: i12:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: fmov s3, w4
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: ldr s19, [sp, #8]
-; CHECK-GI-NEXT: mov v3.s[0], w4
+; CHECK-GI-NEXT: ldr s20, [sp, #8]
; CHECK-GI-NEXT: ldr s2, [sp, #32]
-; CHECK-GI-NEXT: ldr s20, [sp, #40]
+; CHECK-GI-NEXT: ldr s21, [sp, #40]
; CHECK-GI-NEXT: ldr s16, [sp, #64]
-; CHECK-GI-NEXT: ldr s21, [sp, #72]
-; CHECK-GI-NEXT: mov v0.s[1], v19.s[0]
+; CHECK-GI-NEXT: ldr s22, [sp, #72]
; CHECK-GI-NEXT: ldr s17, [sp, #96]
-; CHECK-GI-NEXT: ldr s22, [sp, #104]
-; CHECK-GI-NEXT: mov v2.s[1], v20.s[0]
-; CHECK-GI-NEXT: ldr s18, [sp, #128]
-; CHECK-GI-NEXT: ldr s23, [sp, #136]
+; CHECK-GI-NEXT: ldr s23, [sp, #104]
; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: ldr s19, [sp, #160]
-; CHECK-GI-NEXT: ldr s24, [sp, #168]
; CHECK-GI-NEXT: mov v3.s[1], w5
+; CHECK-GI-NEXT: ldr s18, [sp, #128]
+; CHECK-GI-NEXT: ldr s24, [sp, #136]
+; CHECK-GI-NEXT: mov v0.s[1], v20.s[0]
+; CHECK-GI-NEXT: ldr s19, [sp, #160]
+; CHECK-GI-NEXT: ldr s25, [sp, #168]
+; CHECK-GI-NEXT: mov v2.s[1], v21.s[0]
+; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT: mov v17.s[1], v23.s[0]
+; CHECK-GI-NEXT: mov v18.s[1], v24.s[0]
+; CHECK-GI-NEXT: mov v19.s[1], v25.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: mov v16.s[1], v21.s[0]
-; CHECK-GI-NEXT: mov v17.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v18.s[1], v23.s[0]
-; CHECK-GI-NEXT: mov v19.s[1], v24.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s20, [sp, #80]
; CHECK-GI-NEXT: ldr s21, [sp, #112]
-; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT: ldr s6, [sp, #144]
-; CHECK-GI-NEXT: ldr s22, [sp, #176]
+; CHECK-GI-NEXT: ldr s22, [sp, #144]
+; CHECK-GI-NEXT: ldr s23, [sp, #176]
; CHECK-GI-NEXT: mov v1.s[2], w2
; CHECK-GI-NEXT: mov v3.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
; CHECK-GI-NEXT: mov v2.s[2], v7.s[0]
; CHECK-GI-NEXT: mov v16.s[2], v20.s[0]
; CHECK-GI-NEXT: mov v17.s[2], v21.s[0]
-; CHECK-GI-NEXT: mov v18.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v19.s[2], v22.s[0]
+; CHECK-GI-NEXT: mov v18.s[2], v22.s[0]
+; CHECK-GI-NEXT: mov v19.s[2], v23.s[0]
; CHECK-GI-NEXT: ldr s4, [sp, #24]
; CHECK-GI-NEXT: ldr s5, [sp, #56]
; CHECK-GI-NEXT: ldr s6, [sp, #88]
diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
index f83ac8ed642cc1..3dbc033dfab964 100644
--- a/llvm/test/CodeGen/AArch64/neon-extmul.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -272,18 +272,18 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-GI-NEXT: mul x15, x15, x16
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: fmov x11, d0
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v1.d[0], x9
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: mul x13, x13, x18
+; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: mul x11, x11, x14
; CHECK-GI-NEXT: mov x14, v6.d[1]
-; CHECK-GI-NEXT: mov v0.d[1], x12
-; CHECK-GI-NEXT: mov v2.d[0], x10
; CHECK-GI-NEXT: mov v1.d[1], x15
+; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: mul x14, x14, x17
-; CHECK-GI-NEXT: mov v3.d[0], x11
-; CHECK-GI-NEXT: mov v2.d[1], x14
+; CHECK-GI-NEXT: fmov d3, x11
; CHECK-GI-NEXT: mov v3.d[1], x13
+; CHECK-GI-NEXT: mov v2.d[1], x14
; CHECK-GI-NEXT: ret
entry:
%s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -423,22 +423,22 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
; CHECK-GI-NEXT: mul x15, x15, x16
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: fmov x11, d0
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: mov v1.d[0], x9
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: mul x13, x13, x18
+; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: mul x11, x11, x14
; CHECK-GI-NEXT: mov x14, v18.d[1]
-; CHECK-GI-NEXT: mov v0.d[1], x12
-; CHECK-GI-NEXT: mov v6.d[0], x10
; CHECK-GI-NEXT: mov v1.d[1], x15
-; CHECK-GI-NEXT: mul x14, x14, x17
+; CHECK-GI-NEXT: fmov d6, x10
; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT: mov v7.d[0], x11
+; CHECK-GI-NEXT: mul x14, x14, x17
; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: mov v6.d[1], x14
+; CHECK-GI-NEXT: fmov d7, x11
; CHECK-GI-NEXT: mov v7.d[1], x13
-; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d
+; CHECK-GI-NEXT: mov v6.d[1], x14
; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d
+; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d
; CHECK-GI-NEXT: ret
entry:
%s0s = sext <8 x i8> %s0 to <8 x i64>
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 3263a5e03c1fdc..0b4764e22860cc 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -78,18 +78,17 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) {
; CHECK-GI-LABEL: vector_gep_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: smov x9, v3.s[0]
-; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: smov x10, v3.s[1]
-; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov x8, d1
-; CHECK-GI-NEXT: mov v4.d[0], x9
-; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov w8, v3.s[2]
-; CHECK-GI-NEXT: mov v4.d[1], x10
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mov v1.d[1], x10
; CHECK-GI-NEXT: add x8, x9, w8, sxtw
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-GI-NEXT: str x8, [x0, #16]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
entry:
@@ -167,13 +166,12 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) {
;
; CHECK-GI-LABEL: vector_gep_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4
; CHECK-GI-NEXT: fmov x9, d5
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: fmov x8, d1
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: fmov x8, d2
; CHECK-GI-NEXT: add x8, x8, x9
@@ -208,21 +206,13 @@ entry:
}
define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) {
-; CHECK-SD-LABEL: vector_gep_v4i128:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov d1, x0
-; CHECK-SD-NEXT: mov v1.d[1], x2
-; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT: str q0, [x4]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: vector_gep_v4i128:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.d[0], x0
-; CHECK-GI-NEXT: mov v1.d[1], x2
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
-; CHECK-GI-NEXT: str q0, [x4]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: vector_gep_v4i128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d1, x0
+; CHECK-NEXT: mov v1.d[1], x2
+; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: str q0, [x4]
+; CHECK-NEXT: ret
entry:
%g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off
store <2 x ptr> %g, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 277e7e9491015b..5237a3491de9b4 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,7 +219,7 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v0.s[2], w2
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
@@ -245,8 +245,8 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: sext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #24 // =0x18
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -280,7 +280,7 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sxtb x8, w2
; CHECK-GI-NEXT: fmov d2, x8
@@ -307,7 +307,7 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: smov w8, v0.h[0]
; CHECK-GI-NEXT: smov w9, v0.h[1]
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: smov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
@@ -382,7 +382,7 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v0.s[2], w2
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
@@ -408,8 +408,8 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: sext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #22 // =0x16
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -443,7 +443,7 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
; CHECK-GI-NEXT: fmov d2, x8
@@ -1024,34 +1024,34 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w4
-; CHECK-GI-NEXT: ldr s2, [sp]
-; CHECK-GI-NEXT: ldr s3, [sp, #8]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: ldr s1, [sp, #8]
+; CHECK-GI-NEXT: ldr s2, [sp, #32]
+; CHECK-GI-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #16]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w5
-; CHECK-GI-NEXT: ldr s5, [sp, #48]
+; CHECK-GI-NEXT: ldr s1, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #48]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w6
-; CHECK-GI-NEXT: ldr s5, [sp, #56]
+; CHECK-GI-NEXT: ldr s1, [sp, #24]
+; CHECK-GI-NEXT: ldr s3, [sp, #56]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], w3
-; CHECK-GI-NEXT: mov v1.s[3], w7
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6
+; CHECK-GI-NEXT: uzp1 v1.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6
-; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6
-; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #6
+; CHECK-GI-NEXT: shl v2.8h, v0.8h, #6
+; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #6
+; CHECK-GI-NEXT: sshr v1.8h, v2.8h, #6
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i16>
@@ -1101,36 +1101,36 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w4
-; CHECK-GI-NEXT: ldr s2, [sp]
-; CHECK-GI-NEXT: ldr s3, [sp, #8]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: ldr s1, [sp, #8]
+; CHECK-GI-NEXT: ldr s2, [sp, #32]
+; CHECK-GI-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #16]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w5
-; CHECK-GI-NEXT: ldr s5, [sp, #48]
+; CHECK-GI-NEXT: ldr s1, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #48]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w6
-; CHECK-GI-NEXT: ldr s5, [sp, #56]
+; CHECK-GI-NEXT: ldr s1, [sp, #24]
+; CHECK-GI-NEXT: ldr s3, [sp, #56]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], w3
-; CHECK-GI-NEXT: mov v1.s[3], w7
-; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22
-; CHECK-GI-NEXT: shl v3.4s, v4.4s, #22
-; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22
-; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22
-; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22
-; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22
-; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22
-; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #22
+; CHECK-GI-NEXT: shl v1.4s, v4.4s, #22
+; CHECK-GI-NEXT: shl v3.4s, v5.4s, #22
+; CHECK-GI-NEXT: shl v4.4s, v0.4s, #22
+; CHECK-GI-NEXT: shl v5.4s, v2.4s, #22
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22
+; CHECK-GI-NEXT: sshr v1.4s, v3.4s, #22
+; CHECK-GI-NEXT: sshr v2.4s, v4.4s, #22
+; CHECK-GI-NEXT: sshr v3.4s, v5.4s, #22
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i32>
@@ -1188,50 +1188,50 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], w0
-; CHECK-GI-NEXT: mov v2.s[0], w2
+; CHECK-GI-NEXT: fmov s7, w0
+; CHECK-GI-NEXT: fmov s17, w2
; CHECK-GI-NEXT: ldr s0, [sp]
-; CHECK-GI-NEXT: mov v3.s[0], w4
-; CHECK-GI-NEXT: mov v4.s[0], w6
-; CHECK-GI-NEXT: ldr s5, [sp, #8]
-; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: ldr s7, [sp, #24]
-; CHECK-GI-NEXT: ldr s16, [sp, #32]
-; CHECK-GI-NEXT: ldr s17, [sp, #40]
-; CHECK-GI-NEXT: ldr s18, [sp, #48]
-; CHECK-GI-NEXT: ldr s19, [sp, #56]
-; CHECK-GI-NEXT: mov v1.s[1], w1
-; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], w3
-; CHECK-GI-NEXT: mov v3.s[1], w5
-; CHECK-GI-NEXT: mov v4.s[1], w7
-; CHECK-GI-NEXT: mov v6.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v16.s[1], v17.s[0]
-; CHECK-GI-NEXT: mov v18.s[1], v19.s[0]
+; CHECK-GI-NEXT: fmov s18, w4
+; CHECK-GI-NEXT: fmov s19, w6
+; CHECK-GI-NEXT: ldr s1, [sp, #8]
+; CHECK-GI-NEXT: ldr s2, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
+; CHECK-GI-NEXT: ldr s6, [sp, #48]
+; CHECK-GI-NEXT: ldr s16, [sp, #56]
+; CHECK-GI-NEXT: mov v7.s[1], w1
+; CHECK-GI-NEXT: mov v17.s[1], w3
+; CHECK-GI-NEXT: mov v18.s[1], w5
+; CHECK-GI-NEXT: mov v19.s[1], w7
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v6.s[1], v16.s[0]
+; CHECK-GI-NEXT: ushll v1.2d, v7.2s, #0
+; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
+; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
+; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT: ushll v3.2d, v3.2s, #0
; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v6.2s, #0
-; CHECK-GI-NEXT: ushll v6.2d, v16.2s, #0
-; CHECK-GI-NEXT: ushll v7.2d, v18.2s, #0
-; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54
+; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #54
-; CHECK-GI-NEXT: shl v2.2d, v2.2d, #54
; CHECK-GI-NEXT: shl v3.2d, v3.2d, #54
-; CHECK-GI-NEXT: shl v16.2d, v4.2d, #54
; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54
-; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54
; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54
-; CHECK-GI-NEXT: sshr v4.2d, v0.2d, #54
+; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54
+; CHECK-GI-NEXT: shl v17.2d, v2.2d, #54
+; CHECK-GI-NEXT: shl v18.2d, v4.2d, #54
+; CHECK-GI-NEXT: shl v19.2d, v6.2d, #54
; CHECK-GI-NEXT: sshr v0.2d, v1.2d, #54
-; CHECK-GI-NEXT: sshr v1.2d, v2.2d, #54
-; CHECK-GI-NEXT: sshr v2.2d, v3.2d, #54
-; CHECK-GI-NEXT: sshr v3.2d, v16.2d, #54
-; CHECK-GI-NEXT: sshr v5.2d, v5.2d, #54
-; CHECK-GI-NEXT: sshr v6.2d, v6.2d, #54
-; CHECK-GI-NEXT: sshr v7.2d, v7.2d, #54
+; CHECK-GI-NEXT: sshr v1.2d, v3.2d, #54
+; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54
+; CHECK-GI-NEXT: sshr v3.2d, v7.2d, #54
+; CHECK-GI-NEXT: sshr v4.2d, v16.2d, #54
+; CHECK-GI-NEXT: sshr v5.2d, v17.2d, #54
+; CHECK-GI-NEXT: sshr v6.2d, v18.2d, #54
+; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54
; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 951458da17c07e..a9e52dcf490676 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -635,8 +635,7 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsl w8, w8, w9
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
%3 = shl <1 x i32> %0, %1
ret <1 x i32> %3
@@ -788,8 +787,7 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: asr w8, w8, w9
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
%3 = ashr <1 x i32> %0, %1
ret <1 x i32> %3
@@ -931,8 +929,7 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsr w8, w8, w9
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
%3 = lshr <1 x i32> %0, %1
ret <1 x i32> %3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 294f26dc0385f8..c81fd26a775256 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -3812,49 +3812,49 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w4
-; CHECK-GI-NEXT: ldr s2, [sp]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-NEXT: ldr s1, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-NEXT: ldr s2, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-NEXT: ldr s3, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v2.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w5
-; CHECK-GI-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v2.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w6
-; CHECK-GI-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], w3
-; CHECK-GI-NEXT: mov v1.s[3], w7
-; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v2.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: uaddlv h0, v0.16b
-; CHECK-GI-NEXT: uaddlv h1, v2.8b
+; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
@@ -3938,49 +3938,49 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
;
; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w4
-; CHECK-GI-NEXT: ldr s2, [sp]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-NEXT: ldr s1, [sp, #32]
; CHECK-GI-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-NEXT: ldr s2, [sp, #64]
; CHECK-GI-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-NEXT: ldr s3, [sp, #96]
; CHECK-GI-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-NEXT: mov v2.s[1], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w5
-; CHECK-GI-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #16]
; CHECK-GI-NEXT: ldr s7, [sp, #48]
; CHECK-GI-NEXT: ldr s16, [sp, #80]
; CHECK-GI-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-NEXT: mov v2.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w6
-; CHECK-GI-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #56]
; CHECK-GI-NEXT: ldr s16, [sp, #88]
; CHECK-GI-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], w3
-; CHECK-GI-NEXT: mov v1.s[3], w7
-; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v2.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: saddlv h0, v0.16b
-; CHECK-GI-NEXT: saddlv h1, v2.8b
+; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
@@ -4125,49 +4125,49 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: mov v0.s[0], w0
-; CHECK-GI-BASE-NEXT: mov v1.s[0], w4
-; CHECK-GI-BASE-NEXT: ldr s2, [sp]
+; CHECK-GI-BASE-NEXT: fmov s4, w0
+; CHECK-GI-BASE-NEXT: fmov s5, w4
+; CHECK-GI-BASE-NEXT: ldr s0, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v0.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v1.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
+; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
+; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
+; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
+; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT: mov v0.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v1.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v0.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v1.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
+; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
+; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v2.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT: uaddlv h1, v2.8b
+; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
@@ -4176,55 +4176,55 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: mov v0.s[0], w0
-; CHECK-GI-DOT-NEXT: mov v2.s[0], w4
-; CHECK-GI-DOT-NEXT: ldr s1, [sp]
+; CHECK-GI-DOT-NEXT: fmov s4, w0
+; CHECK-GI-DOT-NEXT: fmov s5, w4
+; CHECK-GI-DOT-NEXT: ldr s0, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v0.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v2.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
+; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
+; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
+; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
+; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT: mov v0.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v0.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v2.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v5.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v3.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
+; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
+; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v3.8b, v3.8h
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b
-; CHECK-GI-DOT-NEXT: udot v1.4s, v3.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v1.4s
+; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: movi v2.8b, #1
+; CHECK-GI-DOT-NEXT: movi v3.8b, #1
+; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
@@ -4398,49 +4398,49 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: mov v0.s[0], w0
-; CHECK-GI-BASE-NEXT: mov v1.s[0], w4
-; CHECK-GI-BASE-NEXT: ldr s2, [sp]
+; CHECK-GI-BASE-NEXT: fmov s4, w0
+; CHECK-GI-BASE-NEXT: fmov s5, w4
+; CHECK-GI-BASE-NEXT: ldr s0, [sp]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT: mov v2.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v0.s[1], w1
-; CHECK-GI-BASE-NEXT: mov v1.s[1], w5
-; CHECK-GI-BASE-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[1], w1
+; CHECK-GI-BASE-NEXT: mov v5.s[1], w5
+; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT: mov v2.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[2], w2
+; CHECK-GI-BASE-NEXT: mov v5.s[2], w6
+; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT: mov v0.s[2], w2
-; CHECK-GI-BASE-NEXT: mov v1.s[2], w6
-; CHECK-GI-BASE-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56]
; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88]
; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT: mov v2.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT: mov v0.s[3], w3
-; CHECK-GI-BASE-NEXT: mov v1.s[3], w7
-; CHECK-GI-BASE-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT: mov v5.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT: mov v4.s[3], w3
+; CHECK-GI-BASE-NEXT: mov v5.s[3], w7
+; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT: uzp1 v2.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT: saddlv h1, v2.8b
+; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
@@ -4449,55 +4449,55 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: mov v0.s[0], w0
-; CHECK-GI-DOT-NEXT: mov v2.s[0], w4
-; CHECK-GI-DOT-NEXT: ldr s1, [sp]
+; CHECK-GI-DOT-NEXT: fmov s4, w0
+; CHECK-GI-DOT-NEXT: fmov s5, w4
+; CHECK-GI-DOT-NEXT: ldr s0, [sp]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT: ldr s4, [sp, #64]
+; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT: ldr s5, [sp, #96]
+; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT: mov v1.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v0.s[1], w1
-; CHECK-GI-DOT-NEXT: mov v2.s[1], w5
-; CHECK-GI-DOT-NEXT: mov v4.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v5.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT: mov v4.s[1], w1
+; CHECK-GI-DOT-NEXT: mov v5.s[1], w5
+; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT: mov v1.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v4.s[2], w2
+; CHECK-GI-DOT-NEXT: mov v5.s[2], w6
+; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT: mov v0.s[2], w2
-; CHECK-GI-DOT-NEXT: mov v4.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v2.s[2], w6
-; CHECK-GI-DOT-NEXT: mov v5.s[2], v17.s[0]
; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56]
; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88]
; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT: mov v1.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT: mov v3.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT: mov v0.s[3], w3
-; CHECK-GI-DOT-NEXT: mov v2.s[3], w7
-; CHECK-GI-DOT-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT: mov v5.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
-; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-GI-DOT-NEXT: movi v2.8b, #1
-; CHECK-GI-DOT-NEXT: uzp1 v3.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT: movi v4.8b, #1
+; CHECK-GI-DOT-NEXT: mov v4.s[3], w3
+; CHECK-GI-DOT-NEXT: mov v5.s[3], w7
+; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: xtn v3.8b, v3.8h
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b
-; CHECK-GI-DOT-NEXT: sdot v1.4s, v3.16b, v2.16b
-; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v1.4s
+; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT: movi v2.8b, #1
+; CHECK-GI-DOT-NEXT: movi v3.8b, #1
+; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index ef2f297f204c1c..8466f46023e3ab 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -136,7 +136,7 @@ define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -174,7 +174,7 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -203,7 +203,7 @@ define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
;
; CHECK-GI-LABEL: xtn_v2i128_v2i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: mov v0.d[1], x2
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
; CHECK-GI-NEXT: ret
@@ -213,17 +213,11 @@ entry:
}
define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) {
-; CHECK-SD-LABEL: xtn_v2i128_v2i64:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov d0, x0
-; CHECK-SD-NEXT: mov v0.d[1], x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v2i128_v2i64:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], x0
-; CHECK-GI-NEXT: mov v0.d[1], x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v2i128_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov v0.d[1], x2
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <2 x i128> %a to <2 x i64>
ret <2 x i64> %arg1
@@ -350,10 +344,10 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
; CHECK-GI-LABEL: xtn_v3i64_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov x8, d0
-; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: fmov x8, d1
-; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov x8, d2
+; CHECK-GI-NEXT: mov v0.s[1], w9
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 9f3450be607fa5..0b5d239dca323e 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -242,7 +242,7 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov w8, #255 // =0xff
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
@@ -271,8 +271,8 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
; CHECK-GI-LABEL: zext_v3i8_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #255 // =0xff
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -304,7 +304,7 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: zext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: and x8, x2, #0xff
@@ -331,7 +331,7 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: umov w8, v0.h[2]
; CHECK-GI-NEXT: mov v1.s[1], w9
; CHECK-GI-NEXT: mov v1.s[2], w8
@@ -406,7 +406,7 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
@@ -435,8 +435,8 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
; CHECK-GI-LABEL: zext_v3i10_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v0.s[1], w1
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w2
@@ -467,7 +467,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: adrp x8, .LCPI27_0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_0]
@@ -1096,33 +1096,33 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w4
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
; CHECK-GI-NEXT: ldr s2, [sp]
-; CHECK-GI-NEXT: ldr s3, [sp, #8]
-; CHECK-GI-NEXT: ldr s4, [sp, #32]
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
+; CHECK-GI-NEXT: ldr s0, [sp, #8]
+; CHECK-GI-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-NEXT: ldr s1, [sp, #40]
; CHECK-GI-NEXT: movi v6.4s, #3, msl #8
-; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #16]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w5
-; CHECK-GI-NEXT: ldr s5, [sp, #48]
-; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v5.s[0]
-; CHECK-GI-NEXT: ldr s3, [sp, #24]
-; CHECK-GI-NEXT: mov v0.s[2], w2
-; CHECK-GI-NEXT: mov v1.s[2], w6
-; CHECK-GI-NEXT: ldr s5, [sp, #56]
-; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT: mov v4.s[3], v5.s[0]
-; CHECK-GI-NEXT: mov v0.s[3], w3
-; CHECK-GI-NEXT: mov v1.s[3], w7
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr s0, [sp, #16]
+; CHECK-GI-NEXT: ldr s1, [sp, #48]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], v1.s[0]
+; CHECK-GI-NEXT: ldr s0, [sp, #24]
+; CHECK-GI-NEXT: ldr s1, [sp, #56]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-GI-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-GI-NEXT: and v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v1.16b, v5.16b, v6.16b
; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
-; CHECK-GI-NEXT: and v3.16b, v4.16b, v6.16b
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v6.16b
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v6.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v6.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i32>
@@ -1174,44 +1174,44 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
;
; CHECK-GI-LABEL: zext_v16i10_v16i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.s[0], w0
-; CHECK-GI-NEXT: mov v1.s[0], w2
-; CHECK-GI-NEXT: ldr s3, [sp]
-; CHECK-GI-NEXT: mov v2.s[0], w4
-; CHECK-GI-NEXT: mov v5.s[0], w6
-; CHECK-GI-NEXT: ldr s4, [sp, #8]
-; CHECK-GI-NEXT: ldr s6, [sp, #16]
-; CHECK-GI-NEXT: ldr s7, [sp, #24]
-; CHECK-GI-NEXT: ldr s16, [sp, #32]
-; CHECK-GI-NEXT: ldr s17, [sp, #40]
-; CHECK-GI-NEXT: ldr s18, [sp, #48]
-; CHECK-GI-NEXT: ldr s19, [sp, #56]
-; CHECK-GI-NEXT: mov v0.s[1], w1
-; CHECK-GI-NEXT: mov v1.s[1], w3
-; CHECK-GI-NEXT: mov v3.s[1], v4.s[0]
-; CHECK-GI-NEXT: mov v2.s[1], w5
-; CHECK-GI-NEXT: mov v5.s[1], w7
+; CHECK-GI-NEXT: fmov s16, w0
+; CHECK-GI-NEXT: fmov s17, w2
+; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: fmov s18, w4
+; CHECK-GI-NEXT: fmov s19, w6
+; CHECK-GI-NEXT: ldr s1, [sp, #8]
+; CHECK-GI-NEXT: ldr s2, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #24]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
+; CHECK-GI-NEXT: ldr s6, [sp, #48]
+; CHECK-GI-NEXT: ldr s7, [sp, #56]
+; CHECK-GI-NEXT: mov v16.s[1], w1
+; CHECK-GI-NEXT: mov v17.s[1], w3
+; CHECK-GI-NEXT: mov v18.s[1], w5
+; CHECK-GI-NEXT: mov v19.s[1], w7
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
; CHECK-GI-NEXT: mov v6.s[1], v7.s[0]
-; CHECK-GI-NEXT: mov v16.s[1], v17.s[0]
-; CHECK-GI-NEXT: mov v18.s[1], v19.s[0]
; CHECK-GI-NEXT: adrp x8, .LCPI54_0
-; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT: ushll v4.2d, v5.2s, #0
-; CHECK-GI-NEXT: ushll v5.2d, v3.2s, #0
-; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-GI-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-GI-NEXT: ushll v17.2d, v18.2s, #0
-; CHECK-GI-NEXT: and v0.16b, v0.16b, v7.16b
-; CHECK-GI-NEXT: and v1.16b, v1.16b, v7.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v7.16b
-; CHECK-GI-NEXT: and v3.16b, v4.16b, v7.16b
-; CHECK-GI-NEXT: and v4.16b, v5.16b, v7.16b
-; CHECK-GI-NEXT: and v5.16b, v6.16b, v7.16b
-; CHECK-GI-NEXT: and v6.16b, v16.16b, v7.16b
-; CHECK-GI-NEXT: and v7.16b, v17.16b, v7.16b
+; CHECK-GI-NEXT: ushll v1.2d, v16.2s, #0
+; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0
+; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0
+; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0
+; CHECK-GI-NEXT: ushll v16.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll v18.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll v19.2d, v4.2s, #0
+; CHECK-GI-NEXT: ushll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT: and v0.16b, v1.16b, v17.16b
+; CHECK-GI-NEXT: and v1.16b, v3.16b, v17.16b
+; CHECK-GI-NEXT: and v2.16b, v5.16b, v17.16b
+; CHECK-GI-NEXT: and v3.16b, v7.16b, v17.16b
+; CHECK-GI-NEXT: and v4.16b, v16.16b, v17.16b
+; CHECK-GI-NEXT: and v5.16b, v18.16b, v17.16b
+; CHECK-GI-NEXT: and v6.16b, v19.16b, v17.16b
+; CHECK-GI-NEXT: and v7.16b, v20.16b, v17.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <16 x i10> %a to <16 x i64>
>From 56cbacdc7c18f6240ae4a93ce2adb1dde60a6693 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 21 Aug 2024 09:23:38 +0000
Subject: [PATCH 4/4] [AArch64][GlobalISel] Prefer to use Vector Truncate
Tries to combine scalarised truncates into vector truncate
operations
EXAMPLE:
%a(s32), %b(s32) = G_UNMERGE_VALUES %src(<2 x s32>)
%T_a(s16) = G_TRUNC %a(s32)
%T_b(s16) = G_TRUNC %b(s32)
%Imp(s16) = G_IMPLICIT_DEF
%dst(<4 x s16>) = G_BUILD_VECTOR %T_a(s16), %T_b(s16), %Imp(s16), %Imp(s16)
===>
%Imp(<2 x s32>) = G_IMPLICIT_DEF
%Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x s32>), %Imp(<2 x s32>)
%dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>)
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +
.../include/llvm/Target/GlobalISel/Combine.td | 9 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 104 ++++++++++++++++++
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +-
.../AArch64/GlobalISel/legalize-freeze.mir | 24 +---
.../GlobalISel/legalize-insert-vector-elt.mir | 11 +-
llvm/test/CodeGen/AArch64/bswap.ll | 4 +-
llvm/test/CodeGen/AArch64/concat-vector.ll | 14 +--
.../AArch64/fixed-vector-interleave.ll | 20 +---
llvm/test/CodeGen/AArch64/fptoi.ll | 52 ++-------
llvm/test/CodeGen/AArch64/itofp.ll | 11 +-
llvm/test/CodeGen/AArch64/shift.ll | 90 ++-------------
llvm/test/CodeGen/AArch64/shufflevector.ll | 45 +-------
llvm/test/CodeGen/AArch64/xtn.ll | 17 +--
15 files changed, 174 insertions(+), 235 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 05d7e882f5135c..8556692dcf4f3b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -605,6 +605,9 @@ class CombinerHelper {
bool matchRotateOutOfRange(MachineInstr &MI);
void applyRotateOutOfRange(MachineInstr &MI);
+ bool matchUseVectorTruncate(MachineInstr &MI, Register &MatchInfo);
+ void applyUseVectorTruncate(MachineInstr &MI, Register &MatchInfo);
+
/// \returns true if a G_ICMP instruction \p MI can be replaced with a true
/// or false constant based off of KnownBits information.
bool matchICmpToTrueFalseKnownBits(MachineInstr &MI, int64_t &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 2246e20ecc1dc8..16a3ddf849ca5b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1599,6 +1599,13 @@ def insert_vector_elt_oob : GICombineRule<
[{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+// Fold a G_BUILD_VECTOR of scalar truncs of one unmerge (other lanes undef) into a vector trunc, e.g. v8i8 (buildvector i8 (trunc(unmerge)), i8 (trunc), i8 (trunc), i8 (trunc), undef, undef, undef, undef)
+def combine_use_vector_truncate : GICombineRule<
+ (defs root:$root, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_BUILD_VECTOR):$root,
+ [{ return Helper.matchUseVectorTruncate(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyUseVectorTruncate(*${root}, ${matchinfo}); }])>;
+
def add_of_vscale : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (G_VSCALE $left, $imm1),
@@ -1875,7 +1882,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
- sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat]>;
+ sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat, combine_use_vector_truncate]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d930ab29846297..61cd464cd619b7 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -3367,6 +3367,110 @@ static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits,
isConstTrueVal(TLI, Cst, IsVector, IsFP);
}
+// Match a G_BUILD_VECTOR whose sources are scalar G_TRUNCs of the results of a
+// single G_UNMERGE_VALUES (any other source must be G_IMPLICIT_DEF), so that it
+// can be rewritten as one vector G_TRUNC. On success \p MatchInfo holds the
+// source vector register of that unmerge.
+// EXAMPLE:
+//   %a(s32), %b(s32) = G_UNMERGE_VALUES %src(<2 x s32>)
+//   %T_a(s16) = G_TRUNC %a(s32)
+//   %T_b(s16) = G_TRUNC %b(s32)
+//   %Undef(s16) = G_IMPLICIT_DEF
+//   %dst(<4 x s16>) = G_BUILD_VECTOR %T_a, %T_b, %Undef, %Undef
+// ===>
+//   %Undef(<2 x s32>) = G_IMPLICIT_DEF
+//   %Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x s32>), %Undef(<2 x s32>)
+//   %dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>)
+bool CombinerHelper::matchUseVectorTruncate(MachineInstr &MI,
+                                            Register &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+         "Expected G_BUILD_VECTOR instruction\n");
+
+  unsigned NumOperands = MI.getNumOperands();
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  // Collect the G_TRUNC sources; every other source must be G_IMPLICIT_DEF
+  SmallVector<MachineInstr *> TruncMIs;
+  for (unsigned i = 1; i < NumOperands; ++i) {
+    auto SrcMI = MRI.getVRegDef(MI.getOperand(i).getReg());
+    auto SrcMIOpc = SrcMI->getOpcode();
+
+    if (SrcMIOpc == TargetOpcode::G_TRUNC)
+      TruncMIs.push_back(SrcMI);
+    else if (SrcMIOpc != TargetOpcode::G_IMPLICIT_DEF)
+      return false;
+  }
+
+  if (TruncMIs.size() < 2)
+    return false;
+
+  // Check if the Trunc instructions all come from the same G_UNMERGE_VALUES.
+  // NOTE(review): which unmerge result feeds which lane is not checked.
+  auto UnmergeMI = MRI.getVRegDef(TruncMIs[0]->getOperand(1).getReg());
+  if (UnmergeMI->getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+    return false;
+
+  for (auto TruncMI : TruncMIs) {
+    auto SrcMI = MRI.getVRegDef(TruncMI->getOperand(1).getReg());
+    if (!UnmergeMI->isIdenticalTo(*SrcMI))
+      return false;
+  }
+
+  // Dst must hold a whole number of unmerge-source (last operand) vectors
+  MatchInfo = UnmergeMI->getOperand(UnmergeMI->getNumOperands() - 1).getReg();
+  LLT UnmergeSrcTy = MRI.getType(MatchInfo);
+  unsigned DstTyNumElt = DstTy.getNumElements();
+  unsigned UnmergeSrcTyNumElt = UnmergeSrcTy.getNumElements();
+  if (DstTyNumElt % UnmergeSrcTyNumElt != 0)
+    return false;
+
+  // If post legalizer, ensure generated instructions are legal
+  if (!IsPreLegalize) {
+    LLT MidTy = DstTy.changeElementSize(UnmergeSrcTy.getScalarSizeInBits());
+
+    if (DstTyNumElt != UnmergeSrcTyNumElt &&
+        !isLegal({TargetOpcode::G_CONCAT_VECTORS, {MidTy, UnmergeSrcTy}}))
+      return false;
+
+    if (!isLegal({TargetOpcode::G_TRUNC, {DstTy, MidTy}}))
+      return false;
+  }
+
+  return true;
+}
+
+void CombinerHelper::applyUseVectorTruncate(MachineInstr &MI,
+ Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+ "Expected G_BUILD_VECTOR instruction\n");
+ // Rewrite MI as a vector G_TRUNC of MatchInfo, padded with undef if needed.
+ Register MidReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT UnmergeSrcTy = MRI.getType(MatchInfo);
+ unsigned DstTyNumElt = DstTy.getNumElements();
+ unsigned UnmergeSrcTyNumElt = UnmergeSrcTy.getNumElements();
+
+ // Equal element counts: truncate MatchInfo directly, no padding is required
+ if (DstTyNumElt / UnmergeSrcTyNumElt == 1) {
+ MidReg = MatchInfo;
+ } else {
+ Register UndefReg = Builder.buildUndef(UnmergeSrcTy).getReg(0);
+ SmallVector<Register> ConcatRegs = {MatchInfo};
+ for (unsigned i = 1; i < DstTyNumElt / UnmergeSrcTyNumElt; ++i)
+ ConcatRegs.push_back(UndefReg);
+ // MidReg: DstTy's element count with the source element width.
+ MidReg = Builder
+ .buildConcatVectors(DstTy.changeElementSize(
+ UnmergeSrcTy.getScalarSizeInBits()),
+ ConcatRegs)
+ .getReg(0);
+ }
+ // Truncate into the original destination, then erase the G_BUILD_VECTOR.
+ Builder.buildTrunc(DstReg, MidReg);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchNotCmp(MachineInstr &MI,
SmallVectorImpl<Register> &RegsToNegate) {
assert(MI.getOpcode() == TargetOpcode::G_XOR);
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ef00e962f3870f..c8724d1c610324 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -330,5 +330,5 @@ def AArch64PostLegalizerCombiner
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating,
- combine_mul_cmlt]> {
+ combine_mul_cmlt, combine_use_vector_truncate]> {
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d3c5742cee3eb4..8ae6ec795b272d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -95,7 +95,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(
{G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
.legalFor({p0, s8, s16, s32, s64})
- .legalFor(PackedVectorAllTypeList)
+ .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8,
+ v2s16, v2s8})
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64)
.moreElementsToNextPow2(0)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
index 3e768c4d7a267c..03c28efe7e09fb 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
@@ -159,25 +159,13 @@ body: |
; CHECK-LABEL: name: test_freeze_v3s8
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
- ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
- ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s16>) = G_FREEZE [[UV4]]
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[FREEZE]](<4 x s16>)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]]
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
- ; CHECK-NEXT: %ext0:_(s32) = G_AND [[ANYEXT1]], [[C]]
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; CHECK-NEXT: %ext1:_(s32) = G_AND [[ANYEXT2]], [[C]]
- ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
- ; CHECK-NEXT: %ext2:_(s32) = G_AND [[ANYEXT3]], [[C]]
+ ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8)
+ ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8)
+ ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8)
; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32)
; CHECK-NEXT: $q0 = COPY %res(<4 x s32>)
%x:_(<3 x s8>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
index 9a8697c1d9b866..11c6c7fb40faa1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
@@ -248,13 +248,10 @@ body: |
; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
- ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16)
- ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16)
- ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV8]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>)
; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index df901e70ea3ac1..b2eab3fb77ca91 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -109,9 +109,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){
;
; CHECK-GI-LABEL: bswap_v2i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: rev16 v0.8b, v0.8b
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 932732d18c0ad4..9b224f11d3fb93 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -178,13 +178,10 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v8s16_v2s16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.4s, w8
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: ldr h2, [x0, #2]
-; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT: xtn v2.4h, v0.4s
-; CHECK-GI-NEXT: xtn v0.4h, v1.4s
-; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: mov v0.s[3], w8
@@ -202,10 +199,7 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v16s8_v4s8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.8h, w8
-; CHECK-GI-NEXT: xtn v1.8b, v0.8h
; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: mov v0.s[3], w8
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 2ea7e0f3c44a9a..a9618fdc2dec30 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -3,22 +3,10 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
-; CHECK-SD-LABEL: interleave2_v4f16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: interleave2_v4f16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v2.4s, w8
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: xtn v2.4h, v2.4s
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[1], w8
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: interleave2_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
}
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 08c5bd59f93e0d..77f8c492ac8eb2 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -3172,42 +3172,22 @@ entry:
}
define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
-; CHECK-SD-LABEL: fptos_v3f32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptos_v3f32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptos_v3f32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptosi <3 x float> %a to <3 x i16>
ret <3 x i16> %c
}
define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
-; CHECK-SD-LABEL: fptou_v3f32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptou_v3f32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptou_v3f32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptoui <3 x float> %a to <3 x i16>
ret <3 x i16> %c
@@ -6077,11 +6057,7 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16:
@@ -6110,11 +6086,7 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16:
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index d9fc3eabd34873..1545779ec79be0 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -7450,9 +7450,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
;
; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -7493,9 +7491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
;
; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -8059,8 +8055,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index a9e52dcf490676..50be46a2635c1a 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -531,26 +531,8 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: shl_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v1.h[1]
-; CHECK-GI-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov h3, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: mov v1.b[1], w9
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
@@ -592,12 +574,8 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: shl_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
@@ -685,26 +663,8 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: ashr_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[3]
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v1.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
@@ -746,12 +706,8 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: ashr_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v1.s[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
@@ -828,26 +784,8 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: lshr_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[3]
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v1.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
@@ -888,12 +826,8 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: lshr_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v1.s[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 954458e4459749..5f4ff1e64673bb 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -209,27 +209,9 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v1.h[1]
-; CHECK-GI-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov h3, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: mov v1.b[1], w9
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: adrp x8, .LCPI15_0
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -284,13 +266,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: adrp x8, .LCPI17_0
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -403,16 +381,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
@@ -448,9 +417,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 8466f46023e3ab..fa5020e7415d37 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -287,19 +287,10 @@ entry:
}
define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
-; CHECK-SD-LABEL: xtn_v3i32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v3i32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v3i32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <3 x i32> %a to <3 x i16>
ret <3 x i16> %arg1
More information about the llvm-commits
mailing list