[llvm] 72105d1 - [AArch64] Avoid using intermediate integer registers for copying between source and destination floating point registers
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 27 12:25:35 PST 2023
Author: Nilanjana Basu
Date: 2023-02-27T12:20:10-08:00
New Revision: 72105d10d5296ac175eb1339c4f71b67905fde61
URL: https://github.com/llvm/llvm-project/commit/72105d10d5296ac175eb1339c4f71b67905fde61
DIFF: https://github.com/llvm/llvm-project/commit/72105d10d5296ac175eb1339c4f71b67905fde61.diff
LOG: [AArch64] Avoid using intermediate integer registers for copying between source and destination floating point registers
In post-isel code, there are cases where a value is redundantly copied from a source FPR to an intermediate GPR only to be copied again into a destination FPR. This patch identifies these patterns in the post-isel peephole optimization and replaces them with a direct FPR-to-FPR copy.
One example is inserting the scalar result of the 'uaddlv' NEON intrinsic into a destination vector. During the instruction selection phase, the 'uaddlv' result is copied to a GPR, and a vector insert instruction is matched separately to move that result into a destination SIMD&FP register.
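For example, in the lowering of insert_vec_v4i32_uaddlv_from_v8i16 (one of the updated tests below), the 'fmov w8, s1' / 'mov.s v0[0], w8' pair becomes a single 'mov.s v0[0], v1[0]'. A minimal IR sketch of the pattern follows; it is an illustrative reconstruction, not the exact test body, though the intrinsic declaration matches the one used by the tests:

define void @insert_vec_v4i32_uaddlv_from_v8i16(ptr %p) {
entry:
  ; Horizontal add across all eight i16 lanes; isel places the scalar
  ; result in an FPR (s-register).
  %sum = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  ; Inserting that scalar into a vector previously round-tripped
  ; through a GPR; the peephole now keeps it in the FP/SIMD unit.
  %vec = insertelement <4 x i32> zeroinitializer, i32 %sum, i32 0
  %fp = uitofp <4 x i32> %vec to <4 x float>
  store <4 x float> %fp, ptr %p, align 8
  ret void
}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)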
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D142594
Added:
Modified:
llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 6ff5ced2f330..199e1f748d1f 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -35,6 +35,17 @@
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
// ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
+// 6. %intermediate:gpr32 = COPY %src:fpr128
+// %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
+// ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
+//
+// In cases where a source FPR is copied to a GPR in order to be copied
+// to a destination FPR, we can copy the value directly between the FPRs,
+// eliminating the use of the integer unit. When we match a pattern of
+// INSvi[X]gpr that is preceded by a chain of COPY instructions from an
+// FPR source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
+// instructions.
+//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
@@ -99,6 +110,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitAND(unsigned Opc, MachineInstr &MI);
bool visitORR(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
+ bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -535,6 +547,50 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm(
return true;
}
+bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
+ // Check if this INSvi[X]gpr comes from a COPY of a source FPR128
+ //
+ // From
+ // %intermediate1:gpr64 = COPY %src:fpr128
+ // %intermediate2:gpr32 = COPY %intermediate1:gpr64
+ // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
+ // To
+ // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
+ // src_index
+ // where src_index = 0, X = [8|16|32|64]
+
+ MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
+
+ // For a chain of COPY instructions, find the initial source register
+ // and check if it's an FPR128
+ while (true) {
+ if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
+ return false;
+
+ if (!SrcMI->getOperand(1).getReg().isVirtual())
+ return false;
+
+ if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
+ &AArch64::FPR128RegClass) {
+ break;
+ }
+ SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = SrcMI->getOperand(1).getReg();
+ MachineInstr *INSvilaneMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
+ .addImm(0);
+
+ LLVM_DEBUG(dbgs() << MI << " replaced by:\n" << *INSvilaneMI << "\n");
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -598,6 +654,18 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
{AArch64::ADDXri, AArch64::ADDSXri},
MI);
break;
+ case AArch64::INSvi64gpr:
+ Changed = visitINSviGPR(MI, AArch64::INSvi64lane);
+ break;
+ case AArch64::INSvi32gpr:
+ Changed = visitINSviGPR(MI, AArch64::INSvi32lane);
+ break;
+ case AArch64::INSvi16gpr:
+ Changed = visitINSviGPR(MI, AArch64::INSvi16lane);
+ break;
+ case AArch64::INSvi8gpr:
+ Changed = visitINSviGPR(MI, AArch64::INSvi8lane);
+ break;
}
}
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 7126c3aed767..c017bcfa10e1 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -13,10 +13,9 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.8h s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
-; CHECK-NEXT: ucvtf.2s v0, v1
-; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: mov.s v1[0], v0[0]
+; CHECK-NEXT: ucvtf.2s v1, v1
+; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
entry:
@@ -32,8 +31,7 @@ define void @insert_vec_v4i32_uaddlv_from_v8i16(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.8h s1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
@@ -50,12 +48,11 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v8i16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.8h s1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.8h s2, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
@@ -71,16 +68,15 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v23i32_uaddlv_from_v8i16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.8h s1, v0
+; CHECK-NEXT: add x8, x0, #88
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.8h s2, v0
; CHECK-NEXT: stp q0, q0, [x0, #16]
; CHECK-NEXT: stp q0, q0, [x0, #48]
-; CHECK-NEXT: str d0, [x0, #80]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: add x8, x0, #88
; CHECK-NEXT: st1.s { v0 }[2], [x8]
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: str d0, [x0, #80]
+; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
@@ -98,10 +94,9 @@ define void @insert_vec_v2i32_uaddlv_from_v16i8(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.16b h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
-; CHECK-NEXT: ucvtf.2s v0, v1
-; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: mov.s v1[0], v0[0]
+; CHECK-NEXT: ucvtf.2s v1, v1
+; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
entry:
@@ -117,8 +112,7 @@ define void @insert_vec_v2i32_uaddlv_from_v8i8(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.8b h1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: ucvtf.2s v0, v0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
@@ -136,8 +130,7 @@ define void @insert_vec_v2i32_uaddlv_from_v4i16(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.4h s1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: ucvtf.2s v0, v0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
@@ -155,11 +148,10 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: movi.2d v3, #0000000000000000
; CHECK-NEXT: uaddlv.4s d2, v1
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: mov.d v1[0], x8
-; CHECK-NEXT: str d2, [x0, #16]
+; CHECK-NEXT: str d3, [x0, #16]
+; CHECK-NEXT: mov.d v1[0], v2[0]
; CHECK-NEXT: ucvtf.2d v1, v1
; CHECK-NEXT: fcvtn v1.2s, v1.2d
; CHECK-NEXT: mov.d v1[1], v0[0]
@@ -179,8 +171,7 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.d v0[0], x8
+; CHECK-NEXT: mov.d v0[0], v1[0]
; CHECK-NEXT: ucvtf.2d v0, v0
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: str d0, [x0]
@@ -201,8 +192,7 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: str wzr, [x0, #16]
; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: uaddlv.4s d2, v1
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: mov.d v1[0], x8
+; CHECK-NEXT: mov.d v1[0], v2[0]
; CHECK-NEXT: ucvtf.2d v1, v1
; CHECK-NEXT: fcvtn v1.2s, v1.2d
; CHECK-NEXT: mov.d v1[1], v0[0]
@@ -224,11 +214,10 @@ define void @insert_vec_v8i16_uaddlv_from_v8i16(ptr %0) {
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.8h s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.h v1[0], w8
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: mov.h v1[0], v0[0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -244,15 +233,14 @@ define void @insert_vec_v3i16_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v3i16_uaddlv_from_v8i16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.8h s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.h v1[0], w8
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: st1.s { v0 }[2], [x8]
-; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: mov.h v1[0], v0[0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: st1.s { v1 }[2], [x8]
+; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
entry:
@@ -269,12 +257,11 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: movi.2d v3, #0000000000000000
-; CHECK-NEXT: uaddlv.4h s2, v1
+; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: uaddlv.4h s3, v1
; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov.s v3[0], w8
-; CHECK-NEXT: ucvtf.2d v2, v3
+; CHECK-NEXT: mov.s v2[0], v3[0]
+; CHECK-NEXT: ucvtf.2d v2, v2
; CHECK-NEXT: fcvtn v2.2s, v2.2d
; CHECK-NEXT: mov.d v2[1], v0[0]
; CHECK-NEXT: stp q2, q1, [x0]
@@ -293,12 +280,11 @@ define void @insert_vec_v16i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-LABEL: insert_vec_v16i8_uaddlv_from_v8i8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.8b h1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.8b h2, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.b v2[0], w8
-; CHECK-NEXT: zip1.8b v1, v2, v0
+; CHECK-NEXT: mov.b v1[0], v2[0]
+; CHECK-NEXT: zip1.8b v1, v1, v0
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
@@ -320,8 +306,7 @@ define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.8b h1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.h v0[0], v1[0]
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ucvtf.4s v0, v0
@@ -344,8 +329,7 @@ define void @insert_vec_v12i16_uaddlv_from_v4i16(ptr %0) {
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: stp xzr, xzr, [x0, #32]
; CHECK-NEXT: uaddlv.4h s1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.h v0[0], v1[0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
@@ -366,8 +350,7 @@ define void @insert_vec_v8i32_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
@@ -385,12 +368,11 @@ define void @insert_vec_v16i32_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.4s d1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.4s d2, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
@@ -409,8 +391,7 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: mov.h v1[0], w8
+; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
@@ -431,11 +412,10 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: mov.h v1[0], w8
-; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
@@ -456,8 +436,7 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: mov.h v1[0], w8
+; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
@@ -479,8 +458,7 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: mov.b v1[0], w8
+; CHECK-NEXT: mov.b v1[0], v0[0]
; CHECK-NEXT: zip1.8b v0, v1, v0
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: bic.4h v0, #255, lsl #8
@@ -504,8 +482,7 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16_nz_index(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.8h s1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v0[2], w8
+; CHECK-NEXT: mov.s v0[2], v1[0]
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9fe0eb9c18bb..20f2f2e67c47 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -3377,13 +3377,11 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
; CHECK-NEXT: mov w8, #127
; CHECK-NEXT: fcvtzs w11, d0
; CHECK-NEXT: mov w9, #-128
-; CHECK-NEXT: fcvtzs w13, d1
; CHECK-NEXT: mov d0, v2.d[1]
-; CHECK-NEXT: fcvtzs w14, d2
+; CHECK-NEXT: fcvtzs w13, d1
; CHECK-NEXT: fcvtzs w10, d16
; CHECK-NEXT: mov d16, v1.d[1]
-; CHECK-NEXT: mov d1, v3.d[1]
-; CHECK-NEXT: fcvtzs w15, d0
+; CHECK-NEXT: fcvtzs w14, d0
; CHECK-NEXT: cmp w10, #127
; CHECK-NEXT: csel w10, w10, w8, lt
; CHECK-NEXT: fcvtzs w12, d16
@@ -3398,117 +3396,112 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
; CHECK-NEXT: cmn w12, #128
; CHECK-NEXT: csel w12, w12, w9, gt
; CHECK-NEXT: cmp w13, #127
-; CHECK-NEXT: csel w13, w13, w8, lt
; CHECK-NEXT: fmov s0, w11
-; CHECK-NEXT: cmn w13, #128
-; CHECK-NEXT: csel w11, w13, w9, gt
-; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: csel w10, w15, w8, lt
-; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: fcvtzs w13, d3
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: csel w10, w10, w9, gt
-; CHECK-NEXT: cmp w14, #127
-; CHECK-NEXT: fcvtzs w11, d1
-; CHECK-NEXT: mov w15, v0.s[1]
-; CHECK-NEXT: csel w14, w14, w8, lt
-; CHECK-NEXT: mov v2.s[1], w12
-; CHECK-NEXT: cmn w14, #128
-; CHECK-NEXT: csel w12, w14, w9, gt
-; CHECK-NEXT: cmp w11, #127
-; CHECK-NEXT: csel w11, w11, w8, lt
-; CHECK-NEXT: mov d1, v4.d[1]
-; CHECK-NEXT: mov v0.b[1], w15
+; CHECK-NEXT: csel w11, w13, w8, lt
; CHECK-NEXT: cmn w11, #128
-; CHECK-NEXT: fmov w14, s2
+; CHECK-NEXT: fcvtzs w13, d2
; CHECK-NEXT: csel w11, w11, w9, gt
-; CHECK-NEXT: fmov s3, w12
-; CHECK-NEXT: cmp w13, #127
-; CHECK-NEXT: mov w12, v2.s[1]
-; CHECK-NEXT: csel w13, w13, w8, lt
-; CHECK-NEXT: mov v0.b[2], w14
-; CHECK-NEXT: cmn w13, #128
-; CHECK-NEXT: mov v3.s[1], w10
-; CHECK-NEXT: csel w13, w13, w9, gt
-; CHECK-NEXT: fcvtzs w15, d1
-; CHECK-NEXT: fcvtzs w14, d4
-; CHECK-NEXT: mov d1, v5.d[1]
-; CHECK-NEXT: mov v0.b[3], w12
-; CHECK-NEXT: fmov s4, w13
-; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: fmov w13, s3
-; CHECK-NEXT: csel w10, w15, w8, lt
-; CHECK-NEXT: mov w12, v3.s[1]
-; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: fcvtzs w15, d1
-; CHECK-NEXT: csel w10, w10, w9, gt
; CHECK-NEXT: cmp w14, #127
-; CHECK-NEXT: mov v0.b[4], w13
+; CHECK-NEXT: mov v0.s[1], w10
; CHECK-NEXT: csel w14, w14, w8, lt
-; CHECK-NEXT: mov v4.s[1], w11
; CHECK-NEXT: cmn w14, #128
-; CHECK-NEXT: csel w14, w14, w9, gt
-; CHECK-NEXT: fcvtzs w13, d5
-; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: mov d2, v6.d[1]
-; CHECK-NEXT: mov v0.b[5], w12
-; CHECK-NEXT: csel w11, w15, w8, lt
-; CHECK-NEXT: fmov w12, s4
-; CHECK-NEXT: cmn w11, #128
-; CHECK-NEXT: fmov s1, w14
-; CHECK-NEXT: csel w11, w11, w9, gt
+; CHECK-NEXT: mov d2, v3.d[1]
+; CHECK-NEXT: fmov s1, w11
+; CHECK-NEXT: csel w11, w14, w9, gt
; CHECK-NEXT: cmp w13, #127
-; CHECK-NEXT: mov w14, v4.s[1]
-; CHECK-NEXT: mov v0.b[6], w12
+; CHECK-NEXT: fcvtzs w10, d3
+; CHECK-NEXT: mov w14, v0.s[1]
; CHECK-NEXT: csel w13, w13, w8, lt
-; CHECK-NEXT: mov v1.s[1], w10
; CHECK-NEXT: cmn w13, #128
-; CHECK-NEXT: fcvtzs w15, d2
+; CHECK-NEXT: mov d3, v4.d[1]
; CHECK-NEXT: csel w13, w13, w9, gt
-; CHECK-NEXT: fcvtzs w10, d6
-; CHECK-NEXT: mov v0.b[7], w14
-; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: fmov w14, s1
-; CHECK-NEXT: csel w12, w15, w8, lt
+; CHECK-NEXT: mov v1.s[1], w12
+; CHECK-NEXT: fcvtzs w12, d2
+; CHECK-NEXT: mov v0.b[1], w14
; CHECK-NEXT: fmov s2, w13
-; CHECK-NEXT: mov w13, v1.s[1]
-; CHECK-NEXT: mov d1, v7.d[1]
+; CHECK-NEXT: cmp w12, #127
+; CHECK-NEXT: fcvtzs w13, d3
+; CHECK-NEXT: csel w12, w12, w8, lt
+; CHECK-NEXT: fcvtzs w14, d4
; CHECK-NEXT: cmn w12, #128
-; CHECK-NEXT: fcvtzs w15, d7
+; CHECK-NEXT: mov d3, v5.d[1]
+; CHECK-NEXT: mov v2.s[1], w11
+; CHECK-NEXT: mov w11, v1.s[1]
+; CHECK-NEXT: mov v0.b[2], v1.b[0]
; CHECK-NEXT: csel w12, w12, w9, gt
; CHECK-NEXT: cmp w10, #127
-; CHECK-NEXT: mov v0.b[8], w14
+; CHECK-NEXT: mov d4, v6.d[1]
; CHECK-NEXT: csel w10, w10, w8, lt
-; CHECK-NEXT: mov v2.s[1], w11
; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: fcvtzs w11, d1
; CHECK-NEXT: csel w10, w10, w9, gt
-; CHECK-NEXT: mov v0.b[9], w13
-; CHECK-NEXT: fmov w14, s2
-; CHECK-NEXT: cmp w11, #127
-; CHECK-NEXT: fmov s1, w10
-; CHECK-NEXT: csel w10, w11, w8, lt
+; CHECK-NEXT: cmp w13, #127
+; CHECK-NEXT: mov v0.b[3], w11
+; CHECK-NEXT: csel w13, w13, w8, lt
+; CHECK-NEXT: cmn w13, #128
+; CHECK-NEXT: fcvtzs w11, d3
+; CHECK-NEXT: csel w13, w13, w9, gt
+; CHECK-NEXT: cmp w14, #127
+; CHECK-NEXT: fmov s3, w10
+; CHECK-NEXT: csel w10, w14, w8, lt
+; CHECK-NEXT: mov w14, v2.s[1]
; CHECK-NEXT: cmn w10, #128
-; CHECK-NEXT: mov w13, v2.s[1]
-; CHECK-NEXT: mov v0.b[10], w14
+; CHECK-NEXT: mov v0.b[4], v2.b[0]
; CHECK-NEXT: csel w10, w10, w9, gt
-; CHECK-NEXT: cmp w15, #127
-; CHECK-NEXT: mov v1.s[1], w12
-; CHECK-NEXT: csel w8, w15, w8, lt
+; CHECK-NEXT: mov v3.s[1], w12
+; CHECK-NEXT: cmp w11, #127
+; CHECK-NEXT: csel w11, w11, w8, lt
+; CHECK-NEXT: fcvtzs w12, d5
+; CHECK-NEXT: cmn w11, #128
+; CHECK-NEXT: mov v0.b[5], w14
+; CHECK-NEXT: fcvtzs w14, d4
+; CHECK-NEXT: fmov s4, w10
+; CHECK-NEXT: csel w10, w11, w9, gt
+; CHECK-NEXT: mov w11, v3.s[1]
+; CHECK-NEXT: cmp w12, #127
+; CHECK-NEXT: csel w12, w12, w8, lt
+; CHECK-NEXT: mov v0.b[6], v3.b[0]
+; CHECK-NEXT: cmn w12, #128
+; CHECK-NEXT: mov v4.s[1], w13
+; CHECK-NEXT: csel w12, w12, w9, gt
+; CHECK-NEXT: cmp w14, #127
+; CHECK-NEXT: csel w13, w14, w8, lt
+; CHECK-NEXT: mov v0.b[7], w11
+; CHECK-NEXT: fcvtzs w11, d6
+; CHECK-NEXT: cmn w13, #128
+; CHECK-NEXT: fmov s5, w12
+; CHECK-NEXT: csel w12, w13, w9, gt
+; CHECK-NEXT: mov w13, v4.s[1]
+; CHECK-NEXT: cmp w11, #127
+; CHECK-NEXT: mov d6, v7.d[1]
+; CHECK-NEXT: mov v0.b[8], v4.b[0]
+; CHECK-NEXT: csel w11, w11, w8, lt
+; CHECK-NEXT: cmn w11, #128
+; CHECK-NEXT: mov v5.s[1], w10
+; CHECK-NEXT: csel w10, w11, w9, gt
+; CHECK-NEXT: fcvtzs w11, d6
+; CHECK-NEXT: mov v0.b[9], w13
+; CHECK-NEXT: fcvtzs w13, d7
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: mov w10, v5.s[1]
+; CHECK-NEXT: cmp w11, #127
+; CHECK-NEXT: csel w11, w11, w8, lt
+; CHECK-NEXT: mov v0.b[10], v5.b[0]
+; CHECK-NEXT: cmn w11, #128
+; CHECK-NEXT: mov v6.s[1], w12
+; CHECK-NEXT: mov v0.b[11], w10
+; CHECK-NEXT: csel w10, w11, w9, gt
+; CHECK-NEXT: cmp w13, #127
+; CHECK-NEXT: csel w8, w13, w8, lt
; CHECK-NEXT: cmn w8, #128
; CHECK-NEXT: csel w8, w8, w9, gt
-; CHECK-NEXT: mov v0.b[11], w13
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.b[12], w9
-; CHECK-NEXT: mov v2.s[1], w10
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w9
+; CHECK-NEXT: mov w9, v6.s[1]
+; CHECK-NEXT: mov v0.b[12], v6.b[0]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: mov v0.b[13], w9
+; CHECK-NEXT: mov v7.s[1], w10
+; CHECK-NEXT: mov v0.b[14], v7.b[0]
+; CHECK-NEXT: mov w8, v7.s[1]
+; CHECK-NEXT: mov v0.b[15], w8
; CHECK-NEXT: ret
%x = call <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f)
ret <16 x i8> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index c1fbfce3756b..510e0dca2f52 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2820,92 +2820,85 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
; CHECK-NEXT: csel w10, w11, w8, lo
; CHECK-NEXT: cmp w12, #255
; CHECK-NEXT: csel w11, w12, w8, lo
+; CHECK-NEXT: fcvtzu w12, d2
; CHECK-NEXT: mov v0.s[1], w9
; CHECK-NEXT: fcvtzu w9, d1
+; CHECK-NEXT: mov d2, v3.d[1]
; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: fcvtzu w11, d2
; CHECK-NEXT: cmp w9, #255
-; CHECK-NEXT: mov d2, v3.d[1]
-; CHECK-NEXT: mov w12, v0.s[1]
+; CHECK-NEXT: mov w11, v0.s[1]
; CHECK-NEXT: csel w9, w9, w8, lo
+; CHECK-NEXT: cmp w12, #255
; CHECK-NEXT: mov v1.s[1], w10
-; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: csel w11, w11, w8, lo
+; CHECK-NEXT: csel w12, w12, w8, lo
; CHECK-NEXT: fcvtzu w10, d2
-; CHECK-NEXT: mov d2, v4.d[1]
-; CHECK-NEXT: mov v0.b[1], w12
-; CHECK-NEXT: fmov w13, s1
-; CHECK-NEXT: mov w12, v1.s[1]
-; CHECK-NEXT: fmov s1, w11
+; CHECK-NEXT: mov v0.b[1], w11
; CHECK-NEXT: fcvtzu w11, d3
+; CHECK-NEXT: fmov s2, w12
+; CHECK-NEXT: mov w12, v1.s[1]
; CHECK-NEXT: cmp w10, #255
-; CHECK-NEXT: mov v0.b[2], w13
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: csel w9, w10, w8, lo
+; CHECK-NEXT: mov d3, v4.d[1]
+; CHECK-NEXT: csel w10, w10, w8, lo
+; CHECK-NEXT: mov v0.b[2], v1.b[0]
; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: fcvtzu w10, d2
+; CHECK-NEXT: mov v2.s[1], w9
; CHECK-NEXT: csel w11, w11, w8, lo
-; CHECK-NEXT: mov d2, v5.d[1]
+; CHECK-NEXT: fcvtzu w9, d3
+; CHECK-NEXT: mov d3, v5.d[1]
; CHECK-NEXT: mov v0.b[3], w12
-; CHECK-NEXT: fmov w12, s1
-; CHECK-NEXT: cmp w10, #255
-; CHECK-NEXT: mov w13, v1.s[1]
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: fcvtzu w11, d4
-; CHECK-NEXT: mov v0.b[4], w12
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: csel w9, w10, w8, lo
-; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: csel w10, w11, w8, lo
-; CHECK-NEXT: mov v0.b[5], w13
-; CHECK-NEXT: fcvtzu w13, d2
-; CHECK-NEXT: fmov w11, s1
-; CHECK-NEXT: mov w12, v1.s[1]
-; CHECK-NEXT: fmov s1, w10
-; CHECK-NEXT: fcvtzu w10, d5
-; CHECK-NEXT: cmp w13, #255
-; CHECK-NEXT: mov v0.b[6], w11
-; CHECK-NEXT: mov d2, v6.d[1]
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: csel w9, w13, w8, lo
+; CHECK-NEXT: fcvtzu w12, d4
+; CHECK-NEXT: fmov s4, w11
+; CHECK-NEXT: mov w11, v2.s[1]
+; CHECK-NEXT: cmp w9, #255
+; CHECK-NEXT: csel w9, w9, w8, lo
+; CHECK-NEXT: cmp w12, #255
+; CHECK-NEXT: mov v0.b[4], v2.b[0]
+; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: mov v4.s[1], w10
+; CHECK-NEXT: fcvtzu w10, d3
+; CHECK-NEXT: fmov s3, w12
+; CHECK-NEXT: mov v0.b[5], w11
+; CHECK-NEXT: fcvtzu w11, d5
+; CHECK-NEXT: mov w12, v4.s[1]
; CHECK-NEXT: cmp w10, #255
-; CHECK-NEXT: fcvtzu w13, d6
; CHECK-NEXT: csel w10, w10, w8, lo
+; CHECK-NEXT: mov d5, v6.d[1]
+; CHECK-NEXT: cmp w11, #255
+; CHECK-NEXT: mov v0.b[6], v4.b[0]
+; CHECK-NEXT: csel w11, w11, w8, lo
+; CHECK-NEXT: mov v3.s[1], w9
+; CHECK-NEXT: fcvtzu w9, d6
+; CHECK-NEXT: mov d6, v7.d[1]
; CHECK-NEXT: mov v0.b[7], w12
-; CHECK-NEXT: fcvtzu w12, d2
-; CHECK-NEXT: fmov w11, s1
-; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: mov w10, v1.s[1]
+; CHECK-NEXT: fcvtzu w12, d5
+; CHECK-NEXT: fmov s5, w11
+; CHECK-NEXT: mov w11, v3.s[1]
; CHECK-NEXT: cmp w12, #255
-; CHECK-NEXT: mov d1, v7.d[1]
-; CHECK-NEXT: mov v0.b[8], w11
-; CHECK-NEXT: mov v2.s[1], w9
-; CHECK-NEXT: csel w9, w12, w8, lo
-; CHECK-NEXT: cmp w13, #255
-; CHECK-NEXT: csel w11, w13, w8, lo
-; CHECK-NEXT: fcvtzu w13, d7
-; CHECK-NEXT: mov v0.b[9], w10
-; CHECK-NEXT: fmov w10, s2
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: fcvtzu w11, d1
-; CHECK-NEXT: mov w12, v2.s[1]
-; CHECK-NEXT: mov v0.b[10], w10
-; CHECK-NEXT: mov v3.s[1], w9
+; CHECK-NEXT: mov v0.b[8], v3.b[0]
+; CHECK-NEXT: csel w12, w12, w8, lo
+; CHECK-NEXT: cmp w9, #255
+; CHECK-NEXT: mov v5.s[1], w10
+; CHECK-NEXT: csel w9, w9, w8, lo
+; CHECK-NEXT: fcvtzu w10, d6
+; CHECK-NEXT: mov v0.b[9], w11
+; CHECK-NEXT: fcvtzu w11, d7
+; CHECK-NEXT: fmov s16, w9
+; CHECK-NEXT: mov w9, v5.s[1]
+; CHECK-NEXT: cmp w10, #255
+; CHECK-NEXT: mov v0.b[10], v5.b[0]
+; CHECK-NEXT: mov v16.s[1], w12
+; CHECK-NEXT: mov v0.b[11], w9
+; CHECK-NEXT: csel w9, w10, w8, lo
; CHECK-NEXT: cmp w11, #255
-; CHECK-NEXT: csel w9, w11, w8, lo
-; CHECK-NEXT: cmp w13, #255
-; CHECK-NEXT: csel w8, w13, w8, lo
-; CHECK-NEXT: mov v0.b[11], w12
-; CHECK-NEXT: fmov w10, s3
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov w8, v3.s[1]
-; CHECK-NEXT: mov v0.b[12], w10
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w9
+; CHECK-NEXT: mov w10, v16.s[1]
+; CHECK-NEXT: csel w8, w11, w8, lo
+; CHECK-NEXT: mov v0.b[12], v16.b[0]
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: mov v0.b[13], w10
+; CHECK-NEXT: mov v6.s[1], w9
+; CHECK-NEXT: mov v0.b[14], v6.b[0]
+; CHECK-NEXT: mov w8, v6.s[1]
+; CHECK-NEXT: mov v0.b[15], w8
; CHECK-NEXT: ret
%x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f)
ret <16 x i8> %x
diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
index aaa7dd00419e..b2ecd6ead550 100644
--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
@@ -41,16 +41,15 @@ define <8 x i8> @extract_2_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: mov w9, v0.s[2]
; CHECK-NEXT: mov w10, v0.s[3]
; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov w8, v1.s[1]
; CHECK-NEXT: mov v0.b[2], w9
-; CHECK-NEXT: mov w9, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
; CHECK-NEXT: mov v0.b[3], w10
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov w8, v1.s[2]
-; CHECK-NEXT: mov v0.b[5], w9
-; CHECK-NEXT: mov w9, v1.s[3]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w9
+; CHECK-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-NEXT: mov v0.b[5], w8
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.b[6], w9
+; CHECK-NEXT: mov v0.b[7], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
index 7158aab03249..3c344a679780 100644
--- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
@@ -1,14 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -run-pass=aarch64-mi-peephole-opt -mtriple=aarch64-unknown-linux -verify-machineinstrs -o - %s | FileCheck %s
--- |
- source_filename = "/Users/nilanjana/Documents/code/llvm-project/llvm/test/CodeGen/AArch64/tmp.ll"
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>) #0
-
define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
entry:
ret void
@@ -79,18 +71,18 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], killed [[UADDLVv4i32v]], %subreg.dsub
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[INSERT_SUBREG]].dsub
- ; CHECK-NEXT: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[MOVIv2d_ns]], 0, killed [[COPY1]]
+ ; CHECK-NEXT: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[MOVIv2d_ns]], 0, [[INSERT_SUBREG]], 0
; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub
- ; CHECK-NEXT: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 killed [[INSvi64gpr]], implicit $fpcr
+ ; CHECK-NEXT: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 killed [[INSvi64lane]], implicit $fpcr
; CHECK-NEXT: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 killed [[UCVTFv2f64_]], implicit $fpcr
; CHECK-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], killed [[FCVTNv2i32_]], %subreg.dsub
- ; CHECK-NEXT: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG2]], 1, killed [[INSERT_SUBREG1]], 0
+ ; CHECK-NEXT: [[INSvi64lane1:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG2]], 1, killed [[INSERT_SUBREG1]], 0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub
; CHECK-NEXT: STRDui killed [[COPY2]], [[COPY]], 2 :: (store (s64) into %ir.0 + 16)
- ; CHECK-NEXT: STRQui killed [[INSvi64lane]], [[COPY]], 0 :: (store (s128) into %ir.0, align 8)
+ ; CHECK-NEXT: STRQui killed [[INSvi64lane1]], [[COPY]], 0 :: (store (s128) into %ir.0, align 8)
; CHECK-NEXT: RET_ReallyLR
%0:gpr64common = COPY $x0
%1:fpr128 = MOVIv2d_ns 0
@@ -146,8 +138,8 @@ body: |
; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub
- ; CHECK-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 0, killed [[COPY1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
+ ; CHECK-NEXT: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG1]], 0, [[INSERT_SUBREG]], 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[INSvi32lane]].dsub
; CHECK-NEXT: [[UCVTFv2f32_:%[0-9]+]]:fpr64 = nofpexcept UCVTFv2f32 killed [[COPY2]], implicit $fpcr
; CHECK-NEXT: STRDui killed [[UCVTFv2f32_]], [[COPY]], 0 :: (store (s64) into %ir.0)
; CHECK-NEXT: RET_ReallyLR
@@ -202,8 +194,8 @@ body: |
; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub
- ; CHECK-NEXT: [[INSvi16gpr:%[0-9]+]]:fpr128 = INSvi16gpr [[INSERT_SUBREG1]], 0, killed [[COPY1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[INSvi16gpr]].dsub
+ ; CHECK-NEXT: [[INSvi16lane:%[0-9]+]]:fpr128 = INSvi16lane [[INSERT_SUBREG1]], 0, [[INSERT_SUBREG]], 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[INSvi16lane]].dsub
; CHECK-NEXT: [[USHLLv4i16_shift:%[0-9]+]]:fpr128 = USHLLv4i16_shift killed [[COPY2]], 0
; CHECK-NEXT: [[UCVTFv4f32_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv4f32 killed [[USHLLv4i16_shift]], implicit $fpcr
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY $wzr
@@ -275,8 +267,8 @@ body: |
; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub
- ; CHECK-NEXT: [[INSvi8gpr:%[0-9]+]]:fpr128 = INSvi8gpr [[INSERT_SUBREG1]], 0, killed [[COPY2]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr64 = COPY [[INSvi8gpr]].dsub
+ ; CHECK-NEXT: [[INSvi8lane:%[0-9]+]]:fpr128 = INSvi8lane [[INSERT_SUBREG1]], 0, [[INSERT_SUBREG]], 0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr64 = COPY [[INSvi8lane]].dsub
; CHECK-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
; CHECK-NEXT: [[ZIP1v8i8_:%[0-9]+]]:fpr64 = ZIP1v8i8 killed [[COPY3]], killed [[DEF2]]
; CHECK-NEXT: [[BICv4i16_:%[0-9]+]]:fpr64 = BICv4i16 [[ZIP1v8i8_]], 255, 8
@@ -338,8 +330,8 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], killed [[UADDLVv8i16v]], %subreg.ssub
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[INSERT_SUBREG]].ssub
- ; CHECK-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[MOVIv2d_ns]], 2, killed [[COPY1]]
- ; CHECK-NEXT: [[UCVTFv4f32_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv4f32 killed [[INSvi32gpr]], implicit $fpcr
+ ; CHECK-NEXT: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[MOVIv2d_ns]], 2, [[INSERT_SUBREG]], 0
+ ; CHECK-NEXT: [[UCVTFv4f32_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv4f32 killed [[INSvi32lane]], implicit $fpcr
; CHECK-NEXT: STRQui killed [[UCVTFv4f32_]], [[COPY]], 0 :: (store (s128) into %ir.0, align 8)
; CHECK-NEXT: RET_ReallyLR
%0:gpr64common = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index e509a3d1fc5e..559b1982832b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -574,19 +574,18 @@ define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: mov v0.h[0], w8
-; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sunpklo z0.d, z0.s
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [z2.d]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: str s0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index b06a065f65d4..40709ca420bc 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -18,10 +18,9 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v0.h[0], w8
-; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index 4130eefbf903..ff1c899b3d82 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -539,11 +539,10 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov w9, v2.s[1]
+; CHECK-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NEXT: mov w8, v2.s[1]
; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: mov v0.h[0], w8
-; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: sunpklo z0.s, z0.h
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index a82811b99520..05640e68b236 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -18,10 +18,9 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h
; CHECK-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.h[0], w8
-; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NEXT: mov w8, v2.s[1]
+; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0