[llvm] f059d2b - [AArch64] Lower zero cycle FPR zeroing (#156261)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 22:32:55 PDT 2025
Author: Tomer Shafir
Date: 2025-09-10T08:32:51+03:00
New Revision: f059d2bac034acca39ad60a1b13aaec6afa0a3d6
URL: https://github.com/llvm/llvm-project/commit/f059d2bac034acca39ad60a1b13aaec6afa0a3d6
DIFF: https://github.com/llvm/llvm-project/commit/f059d2bac034acca39ad60a1b13aaec6afa0a3d6.diff
LOG: [AArch64] Lower zero cycle FPR zeroing (#156261)
Lower FPR64, FPR32, FPR16 from `fmov` zeroing into NEON zeroing if the
target supports zero cycle zeroing of NEON registers but not for the
narrower classes.
It handles two cases: the first in `AsmPrinter`, where an FP zeroing from
an immediate has been captured by pattern matching during instruction
selection, and the second post-RA in `AArch64InstrInfo::copyPhysReg`, for
uncaptured or later-generated WZR/XZR fmovs.
Adds a subtarget feature called FeatureZCZeroingFPR128 that allows
querying whether the target supports zero cycle zeroing for FPR128 NEON
registers, and updates the appropriate processors.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
llvm/lib/Target/AArch64/AArch64Features.td
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/lib/Target/AArch64/AArch64Processors.td
llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
llvm/test/CodeGen/AArch64/expand-select.ll
llvm/test/CodeGen/AArch64/ext-narrow-index.ll
llvm/test/CodeGen/AArch64/fsh.ll
llvm/test/CodeGen/AArch64/neon-dotreduce.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
llvm/test/CodeGen/AArch64/vecreduce-add.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index fa050526b722c..c31a090bba77f 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {
/// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI);
+ void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
- if (STI->hasZeroCycleZeroingFPR64() &&
- !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
- // Convert H/S register to corresponding D register
- if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
- DestReg = AArch64::D0 + (DestReg - AArch64::H0);
- else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
- DestReg = AArch64::D0 + (DestReg - AArch64::S0);
- else
- assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
+ if (STI->hasZeroCycleZeroingFPR64()) {
+ // Convert H/S register to corresponding D register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ else if (AArch64::FPR32RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ else
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVID);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else if (STI->hasZeroCycleZeroingFPR128()) {
+ // Convert H/S/D register to corresponding Q register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ } else {
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ }
- MCInst MOVI;
- MOVI.setOpcode(AArch64::MOVID);
- MOVI.addOperand(MCOperand::createReg(DestReg));
- MOVI.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MOVI);
- } else {
- MCInst FMov;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case AArch64::FMOVH0:
- FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
- if (!STI->hasFullFP16())
- DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVS0:
- FMov.setOpcode(AArch64::FMOVWSr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVD0:
- FMov.setOpcode(AArch64::FMOVXDr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::XZR));
- break;
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
}
- EmitToStreamer(*OutStreamer, FMov);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
+ }
+}
+
+void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
+ Register DestReg) {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVH0:
+ FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
+ if (!STI->hasFullFP16())
+ DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
}
+ EmitToStreamer(*OutStreamer, FMov);
}
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 6904e09072649..46f5f0c1ca9dd 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
"Has zero-cycle zeroing instructions for GPR32 registers">;
+def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
+ "Has zero-cycle zeroing instructions for FPR128 registers">;
+
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this not true for
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index b47ae5d2cbb17..e56fe90259d5c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5471,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::XZR == SrcReg) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5484,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::WZR == SrcReg) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index d5f4e91ae5188..81f5d075729d9 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128,
FeatureZCZeroingFPWorkaround]>;
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
@@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
@@ -386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
@@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
@@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 7934e39b2b69f..78e20f2a5e214 100644
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
; CHECK-NEXT: add d0, d1, d0
-; CHECK-NEXT: fmov d1, xzr
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: sub d0, d1, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
; GENERIC-NEXT: add d0, d1, d0
-; GENERIC-NEXT: fmov d1, xzr
+; GENERIC-NEXT: movi d1, #0000000000000000
; GENERIC-NEXT: sub d0, d1, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
index 2a75976d58549..ccdaa8779e38f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+zcz-fpr128 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
-; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
@@ -12,9 +13,10 @@ define half @tf16() {
entry:
; ALL-LABEL: tf16:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
-; NOZCZ-FPR64-FULLFP16: mov h0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16: mov h0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret half 0.0
}
@@ -22,8 +24,9 @@ define float @tf32() {
entry:
; ALL-LABEL: tf32:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret float 0.0
}
@@ -31,8 +34,9 @@ define double @td64() {
entry:
; ALL-LABEL: td64:
; FP-WORKAROUND: mov d0, xzr
-; NOZCZ-FPR64: mov d0, xzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov d0, xzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret double 0.0
}
@@ -40,8 +44,9 @@ define <8 x i8> @tv8i8() {
entry:
; ALL-LABEL: tv8i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
@@ -49,8 +54,9 @@ define <4 x i16> @tv4i16() {
entry:
; ALL-LABEL: tv4i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
}
@@ -58,8 +64,9 @@ define <2 x i32> @tv2i32() {
entry:
; ALL-LABEL: tv2i32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x i32> <i32 0, i32 0>
}
@@ -67,8 +74,9 @@ define <2 x float> @tv2f32() {
entry:
; ALL-LABEL: tv2f32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x float> <float 0.0, float 0.0>
}
@@ -76,8 +84,9 @@ define <16 x i8> @tv16i8() {
entry:
; ALL-LABEL: tv16i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
@@ -85,8 +94,9 @@ define <8 x i16> @tv8i16() {
entry:
; ALL-LABEL: tv8i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
}
@@ -94,8 +104,9 @@ define <4 x i32> @tv4i32() {
entry:
; ALL-LABEL: tv4i32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
}
@@ -103,8 +114,9 @@ define <2 x i64> @tv2i64() {
entry:
; ALL-LABEL: tv2i64:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x i64> <i64 0, i64 0>
}
@@ -112,8 +124,9 @@ define <4 x float> @tv4f32() {
entry:
; ALL-LABEL: tv4f32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
}
@@ -121,8 +134,9 @@ define <2 x double> @tv2d64() {
entry:
; ALL-LABEL: tv2d64:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x double> <double 0.0, double 0.0>
}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 53126a08db86f..c0c31427307b5 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -8,7 +8,7 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x
define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-LABEL: allocno_reload_assign:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, xzr
+; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 7ca6adb1338d3..1ca4719d9b6bf 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,8 +4,8 @@
define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: ldr x11, [sp]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: ldp x8, x10, [sp, #8]
@@ -31,8 +31,8 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
index 177f2cafcf833..f62cfef9baf28 100644
--- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
+++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
@@ -382,7 +382,7 @@ entry:
define <1 x i64> @i64_zero_off2(<2 x i64> %arg1) {
; CHECK-LABEL: i64_zero_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, xzr
+; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 2>
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index ae2ef2649102e..765f6b77b41a9 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -1379,7 +1379,7 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-LABEL: rotl_v7i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr s0, [sp, #24]
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: fmov s3, w7
; CHECK-GI-NEXT: ldr s2, [sp, #32]
; CHECK-GI-NEXT: mov x8, sp
@@ -1387,31 +1387,32 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-NEXT: mov v6.16b, v0.16b
; CHECK-GI-NEXT: ldr s7, [sp]
; CHECK-GI-NEXT: ldr s5, [sp, #40]
-; CHECK-GI-NEXT: mov v1.s[1], wzr
; CHECK-GI-NEXT: ld1 { v3.s }[1], [x8]
; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: fmov s16, w0
+; CHECK-GI-NEXT: mov v1.s[1], wzr
; CHECK-GI-NEXT: mov v4.s[1], v7.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #8]
-; CHECK-GI-NEXT: fmov s16, w0
; CHECK-GI-NEXT: mov v6.s[1], v2.s[0]
; CHECK-GI-NEXT: fmov s17, w0
; CHECK-GI-NEXT: add x9, sp, #16
; CHECK-GI-NEXT: ld1 { v3.s }[2], [x8]
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: mov v16.s[1], w1
+; CHECK-GI-NEXT: fmov s18, w8
; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT: fmov s2, w4
; CHECK-GI-NEXT: mov v1.s[2], wzr
-; CHECK-GI-NEXT: fmov s18, w8
-; CHECK-GI-NEXT: mov v16.s[1], w1
; CHECK-GI-NEXT: mov v4.s[2], v7.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #16]
-; CHECK-GI-NEXT: mov v17.s[1], w1
; CHECK-GI-NEXT: mov v6.s[2], v5.s[0]
; CHECK-GI-NEXT: ld1 { v3.s }[3], [x9]
-; CHECK-GI-NEXT: fmov s2, w4
+; CHECK-GI-NEXT: mov v17.s[1], w1
; CHECK-GI-NEXT: mov v18.s[1], w8
; CHECK-GI-NEXT: movi v19.4s, #31
-; CHECK-GI-NEXT: mov v0.s[2], v5.s[0]
; CHECK-GI-NEXT: mov v16.s[2], w2
+; CHECK-GI-NEXT: mov v2.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[2], v5.s[0]
; CHECK-GI-NEXT: mov v4.s[3], v7.s[0]
; CHECK-GI-NEXT: fmov s7, w4
; CHECK-GI-NEXT: neg v3.4s, v3.4s
@@ -1419,15 +1420,14 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-NEXT: fmov s6, w8
; CHECK-GI-NEXT: mov v17.s[2], w2
; CHECK-GI-NEXT: mov v18.s[2], w8
-; CHECK-GI-NEXT: mov v2.s[1], w5
+; CHECK-GI-NEXT: mov v16.s[3], w3
; CHECK-GI-NEXT: mov v7.s[1], w5
; CHECK-GI-NEXT: and v3.16b, v3.16b, v19.16b
-; CHECK-GI-NEXT: mov v16.s[3], w3
+; CHECK-GI-NEXT: mov v2.s[2], w6
; CHECK-GI-NEXT: mov v6.s[1], w8
; CHECK-GI-NEXT: and v4.16b, v4.16b, v19.16b
; CHECK-GI-NEXT: mov v17.s[3], w3
; CHECK-GI-NEXT: and v1.16b, v1.16b, v18.16b
-; CHECK-GI-NEXT: mov v2.s[2], w6
; CHECK-GI-NEXT: neg v3.4s, v3.4s
; CHECK-GI-NEXT: mov v7.s[2], w6
; CHECK-GI-NEXT: mov v6.s[2], w8
@@ -1510,7 +1510,7 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-NEXT: fmov s2, w7
; CHECK-GI-NEXT: mov x8, sp
; CHECK-GI-NEXT: ldr s6, [sp, #8]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: ldr s7, [sp, #32]
; CHECK-GI-NEXT: fmov s16, w0
; CHECK-GI-NEXT: fmov s17, w0
@@ -1518,12 +1518,12 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-NEXT: ldr s3, [sp, #24]
; CHECK-GI-NEXT: ld1 { v2.s }[1], [x8]
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
-; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: add x9, sp, #8
+; CHECK-GI-NEXT: ldr s5, [sp, #40]
; CHECK-GI-NEXT: mov v4.16b, v3.16b
; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: fmov s18, w8
-; CHECK-GI-NEXT: ldr s5, [sp, #40]
; CHECK-GI-NEXT: ld1 { v2.s }[2], [x9]
; CHECK-GI-NEXT: mov v17.s[1], w1
; CHECK-GI-NEXT: mov v1.s[2], v6.s[0]
@@ -1531,9 +1531,9 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) {
; CHECK-GI-NEXT: mov v16.s[1], w1
; CHECK-GI-NEXT: mov v4.s[1], v7.s[0]
; CHECK-GI-NEXT: ldr s7, [sp, #16]
+; CHECK-GI-NEXT: fmov s19, w4
; CHECK-GI-NEXT: mov v18.s[1], w8
; CHECK-GI-NEXT: mov v3.s[2], v5.s[0]
-; CHECK-GI-NEXT: fmov s19, w4
; CHECK-GI-NEXT: add x10, sp, #16
; CHECK-GI-NEXT: mov v6.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], wzr
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 88b6f6c40baca..fb2a1fa697c26 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -2400,7 +2400,7 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: .cfi_offset w30, -88
; CHECK-GI-NEXT: .cfi_offset w29, -96
; CHECK-GI-NEXT: ldp q2, q1, [x1]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill
; CHECK-GI-NEXT: mov b6, v2.b[3]
; CHECK-GI-NEXT: mov b7, v2.b[4]
@@ -2710,7 +2710,7 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w19, -16
; CHECK-GI-NEXT: ldp q2, q1, [x0]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: umov w15, v2.b[0]
; CHECK-GI-NEXT: umov w17, v2.b[4]
; CHECK-GI-NEXT: umov w0, v2.b[8]
@@ -2830,7 +2830,7 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: .cfi_offset w30, -88
; CHECK-GI-NEXT: .cfi_offset w29, -96
; CHECK-GI-NEXT: ldp q2, q1, [x1]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill
; CHECK-GI-NEXT: mov b5, v2.b[2]
; CHECK-GI-NEXT: mov b6, v2.b[3]
@@ -3360,12 +3360,12 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25
; CHECK-GI-NEXT: sbfx w9, w11, #8, #8
; CHECK-GI-NEXT: lsl w11, w3, #8
; CHECK-GI-NEXT: sbfx w14, w14, #8, #8
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: lsl w10, w10, #8
; CHECK-GI-NEXT: mov v4.h[1], w8
; CHECK-GI-NEXT: ldr w8, [sp, #152]
; CHECK-GI-NEXT: sbfx w11, w11, #8, #8
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: mov v2.h[2], w9
; CHECK-GI-NEXT: ldr w9, [sp, #40]
; CHECK-GI-NEXT: sbfx w10, w10, #8, #8
@@ -4012,25 +4012,24 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %
; CHECK-GI-NEXT: sxtb w8, w4
; CHECK-GI-NEXT: sxtb w10, w10
; CHECK-GI-NEXT: ldr w14, [sp, #448]
-; CHECK-GI-NEXT: fmov s1, wzr
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: fmov s3, w8
; CHECK-GI-NEXT: sxtb w8, w2
; CHECK-GI-NEXT: fmov s5, w10
; CHECK-GI-NEXT: mov v2.s[1], w9
; CHECK-GI-NEXT: sxtb w9, w5
; CHECK-GI-NEXT: ldr w10, [sp, #80]
-; CHECK-GI-NEXT: mov v1.s[1], wzr
-; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: mov v3.s[1], w9
; CHECK-GI-NEXT: ldr w9, [sp, #16]
; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v1.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: mov v2.s[2], w8
; CHECK-GI-NEXT: sxtb w9, w9
; CHECK-GI-NEXT: ldr w8, [sp, #24]
; CHECK-GI-NEXT: fmov s6, w10
; CHECK-GI-NEXT: ldr w10, [sp, #64]
-; CHECK-GI-NEXT: mov v1.s[2], wzr
; CHECK-GI-NEXT: mov v3.s[2], w11
; CHECK-GI-NEXT: fmov s4, w9
; CHECK-GI-NEXT: sxtb w8, w8
@@ -4039,7 +4038,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %
; CHECK-GI-NEXT: sxtb w10, w10
; CHECK-GI-NEXT: mov v2.s[3], w12
; CHECK-GI-NEXT: ldr w12, [sp, #88]
-; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: mov v1.s[2], wzr
; CHECK-GI-NEXT: mov v4.s[1], w8
; CHECK-GI-NEXT: ldr w8, [sp, #120]
; CHECK-GI-NEXT: sxtb w9, w9
@@ -4063,7 +4062,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %
; CHECK-GI-NEXT: ldr w10, [sp, #136]
; CHECK-GI-NEXT: sxtb w13, w13
; CHECK-GI-NEXT: sxtb w9, w9
-; CHECK-GI-NEXT: mov v1.s[3], wzr
+; CHECK-GI-NEXT: mov v0.s[2], wzr
; CHECK-GI-NEXT: mov v7.s[1], w8
; CHECK-GI-NEXT: sxtb w10, w10
; CHECK-GI-NEXT: ldr w8, [sp, #72]
@@ -4072,8 +4071,9 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %
; CHECK-GI-NEXT: mov v4.s[3], w9
; CHECK-GI-NEXT: ldr w9, [sp, #360]
; CHECK-GI-NEXT: sxtb w8, w8
-; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: mov v1.s[3], wzr
; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v0.s[3], wzr
; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: mov v7.s[2], w12
; CHECK-GI-NEXT: ldr w12, [sp, #352]
@@ -4562,13 +4562,13 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: .cfi_offset w30, -88
; CHECK-GI-NEXT: .cfi_offset w29, -96
; CHECK-GI-NEXT: ldp q7, q16, [x1]
-; CHECK-GI-NEXT: fmov s5, wzr
+; CHECK-GI-NEXT: movi d5, #0000000000000000
; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill
-; CHECK-GI-NEXT: fmov s6, wzr
-; CHECK-GI-NEXT: fmov s0, wzr
-; CHECK-GI-NEXT: fmov s1, wzr
-; CHECK-GI-NEXT: fmov s3, wzr
-; CHECK-GI-NEXT: fmov s2, wzr
+; CHECK-GI-NEXT: movi d6, #0000000000000000
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: movi d3, #0000000000000000
+; CHECK-GI-NEXT: movi d2, #0000000000000000
; CHECK-GI-NEXT: mov b23, v7.b[7]
; CHECK-GI-NEXT: mov b17, v7.b[1]
; CHECK-GI-NEXT: fmov w11, s7
@@ -4822,7 +4822,7 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: mov v19.h[6], w11
; CHECK-GI-NEXT: fmov w12, s16
; CHECK-GI-NEXT: fmov w11, s7
-; CHECK-GI-NEXT: fmov s4, wzr
+; CHECK-GI-NEXT: movi d4, #0000000000000000
; CHECK-GI-NEXT: uxtb w9, w9
; CHECK-GI-NEXT: mov v20.h[6], w10
; CHECK-GI-NEXT: umov w10, v24.h[0]
@@ -4991,13 +4991,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w20, -16
; CHECK-GI-NEXT: ldp q7, q19, [x0]
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: ldrb w10, [x0, #32]
-; CHECK-GI-NEXT: fmov s0, wzr
-; CHECK-GI-NEXT: fmov s3, wzr
-; CHECK-GI-NEXT: fmov s2, wzr
-; CHECK-GI-NEXT: fmov s5, wzr
-; CHECK-GI-NEXT: fmov s4, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: movi d3, #0000000000000000
+; CHECK-GI-NEXT: movi d2, #0000000000000000
+; CHECK-GI-NEXT: movi d5, #0000000000000000
+; CHECK-GI-NEXT: movi d4, #0000000000000000
; CHECK-GI-NEXT: umov w15, v7.b[8]
; CHECK-GI-NEXT: umov w2, v7.b[12]
; CHECK-GI-NEXT: umov w16, v7.b[9]
@@ -5022,13 +5022,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
; CHECK-GI-NEXT: mov v17.s[1], w16
; CHECK-GI-NEXT: mov v18.s[1], w4
; CHECK-GI-NEXT: umov w4, v19.b[4]
+; CHECK-GI-NEXT: movi d6, #0000000000000000
; CHECK-GI-NEXT: umov w6, v19.b[1]
; CHECK-GI-NEXT: umov w7, v19.b[5]
-; CHECK-GI-NEXT: umov w19, v19.b[9]
; CHECK-GI-NEXT: mov v7.s[1], w9
; CHECK-GI-NEXT: mov v16.s[1], w14
+; CHECK-GI-NEXT: umov w19, v19.b[9]
; CHECK-GI-NEXT: umov w20, v19.b[13]
-; CHECK-GI-NEXT: fmov s6, wzr
; CHECK-GI-NEXT: umov w12, v19.b[2]
; CHECK-GI-NEXT: umov w8, v19.b[3]
; CHECK-GI-NEXT: mov v17.s[2], w3
@@ -5164,13 +5164,13 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: .cfi_offset w30, -88
; CHECK-GI-NEXT: .cfi_offset w29, -96
; CHECK-GI-NEXT: ldp q7, q16, [x1]
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill
-; CHECK-GI-NEXT: fmov s3, wzr
-; CHECK-GI-NEXT: fmov s2, wzr
-; CHECK-GI-NEXT: fmov s5, wzr
-; CHECK-GI-NEXT: fmov s4, wzr
-; CHECK-GI-NEXT: fmov s6, wzr
+; CHECK-GI-NEXT: movi d3, #0000000000000000
+; CHECK-GI-NEXT: movi d2, #0000000000000000
+; CHECK-GI-NEXT: movi d5, #0000000000000000
+; CHECK-GI-NEXT: movi d4, #0000000000000000
+; CHECK-GI-NEXT: movi d6, #0000000000000000
; CHECK-GI-NEXT: mov b19, v7.b[3]
; CHECK-GI-NEXT: mov b23, v7.b[7]
; CHECK-GI-NEXT: mov b17, v7.b[1]
@@ -5454,7 +5454,7 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK-GI-NEXT: smov w8, v20.h[7]
; CHECK-GI-NEXT: sxth w9, w9
; CHECK-GI-NEXT: mov v16.s[1], w12
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: fmov s19, w15
; CHECK-GI-NEXT: smov w15, v22.h[6]
; CHECK-GI-NEXT: mov v1.s[1], wzr
@@ -5900,28 +5900,28 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-GI-NEXT: sbfx w15, w15, #8, #8
; CHECK-GI-NEXT: mov v23.h[2], w8
; CHECK-GI-NEXT: ldr w8, [sp, #112]
-; CHECK-GI-NEXT: fmov s19, wzr
+; CHECK-GI-NEXT: movi d19, #0000000000000000
; CHECK-GI-NEXT: sbfx w9, w9, #8, #8
; CHECK-GI-NEXT: sbfx w11, w11, #8, #8
-; CHECK-GI-NEXT: fmov s21, wzr
+; CHECK-GI-NEXT: movi d21, #0000000000000000
; CHECK-GI-NEXT: mov v22.h[3], w10
; CHECK-GI-NEXT: ldr w10, [sp, #144]
; CHECK-GI-NEXT: lsl w8, w8, #8
-; CHECK-GI-NEXT: fmov s16, wzr
-; CHECK-GI-NEXT: fmov s18, wzr
-; CHECK-GI-NEXT: fmov s17, wzr
+; CHECK-GI-NEXT: movi d16, #0000000000000000
+; CHECK-GI-NEXT: movi d18, #0000000000000000
+; CHECK-GI-NEXT: movi d17, #0000000000000000
; CHECK-GI-NEXT: lsl w10, w10, #8
; CHECK-GI-NEXT: mov v23.h[3], w9
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: ldr w9, [sp, #120]
-; CHECK-GI-NEXT: fmov s20, wzr
-; CHECK-GI-NEXT: fmov s6, wzr
+; CHECK-GI-NEXT: movi d20, #0000000000000000
+; CHECK-GI-NEXT: movi d6, #0000000000000000
; CHECK-GI-NEXT: sbfx w10, w10, #8, #8
; CHECK-GI-NEXT: mov v22.h[4], w11
; CHECK-GI-NEXT: lsl w11, w5, #8
; CHECK-GI-NEXT: lsl w9, w9, #8
-; CHECK-GI-NEXT: fmov s7, wzr
-; CHECK-GI-NEXT: fmov s2, wzr
+; CHECK-GI-NEXT: movi d7, #0000000000000000
+; CHECK-GI-NEXT: movi d2, #0000000000000000
; CHECK-GI-NEXT: fmov s24, w10
; CHECK-GI-NEXT: mov v23.h[4], w8
; CHECK-GI-NEXT: ldr w8, [sp, #160]
@@ -5929,8 +5929,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-GI-NEXT: ldr w10, [sp, #168]
; CHECK-GI-NEXT: sbfx w9, w9, #8, #8
; CHECK-GI-NEXT: lsl w8, w8, #8
-; CHECK-GI-NEXT: fmov s4, wzr
-; CHECK-GI-NEXT: fmov s3, wzr
+; CHECK-GI-NEXT: movi d4, #0000000000000000
+; CHECK-GI-NEXT: movi d3, #0000000000000000
; CHECK-GI-NEXT: mov v24.h[1], w12
; CHECK-GI-NEXT: lsl w12, w6, #8
; CHECK-GI-NEXT: mov v22.h[5], w11
@@ -5941,8 +5941,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-GI-NEXT: ldr w11, [sp, #184]
; CHECK-GI-NEXT: ldr w9, [sp, #192]
; CHECK-GI-NEXT: sbfx w10, w10, #8, #8
-; CHECK-GI-NEXT: fmov s5, wzr
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d5, #0000000000000000
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: mov v24.h[2], w8
; CHECK-GI-NEXT: mov v22.h[6], w12
; CHECK-GI-NEXT: ldr w12, [sp, #208]
@@ -5951,7 +5951,7 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-GI-NEXT: lsl w9, w9, #8
; CHECK-GI-NEXT: lsl w12, w12, #8
; CHECK-GI-NEXT: ldr w8, [sp, #200]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: lsl w13, w13, #8
; CHECK-GI-NEXT: sbfx w9, w9, #8, #8
; CHECK-GI-NEXT: mov v19.s[1], wzr
@@ -6813,10 +6813,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: fmov s23, w12
; CHECK-GI-NEXT: sxtb w10, w10
; CHECK-GI-NEXT: sxtb w12, w7
-; CHECK-GI-NEXT: fmov s18, wzr
+; CHECK-GI-NEXT: movi d18, #0000000000000000
; CHECK-GI-NEXT: sxtb w8, w8
-; CHECK-GI-NEXT: fmov s19, wzr
-; CHECK-GI-NEXT: fmov s20, wzr
+; CHECK-GI-NEXT: movi d19, #0000000000000000
+; CHECK-GI-NEXT: movi d20, #0000000000000000
; CHECK-GI-NEXT: mov v22.s[1], w9
; CHECK-GI-NEXT: sxtb w9, w2
; CHECK-GI-NEXT: mov v23.s[1], w13
@@ -6825,10 +6825,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: sxtb w11, w6
; CHECK-GI-NEXT: ldr w13, [sp, #232]
; CHECK-GI-NEXT: mov v18.s[1], wzr
-; CHECK-GI-NEXT: mov v19.s[1], wzr
+; CHECK-GI-NEXT: movi d21, #0000000000000000
; CHECK-GI-NEXT: fmov s25, w8
; CHECK-GI-NEXT: ldr w8, [sp, #80]
-; CHECK-GI-NEXT: fmov s21, wzr
+; CHECK-GI-NEXT: mov v19.s[1], wzr
; CHECK-GI-NEXT: mov v22.s[2], w9
; CHECK-GI-NEXT: mov v24.s[1], w10
; CHECK-GI-NEXT: sxtb w10, w3
@@ -6837,10 +6837,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: sxtb w8, w8
; CHECK-GI-NEXT: ldr w11, [sp, #136]
; CHECK-GI-NEXT: mov v18.s[2], wzr
-; CHECK-GI-NEXT: mov v19.s[2], wzr
+; CHECK-GI-NEXT: movi d6, #0000000000000000
; CHECK-GI-NEXT: sxtb w9, w9
-; CHECK-GI-NEXT: fmov s6, wzr
-; CHECK-GI-NEXT: fmov s7, wzr
+; CHECK-GI-NEXT: mov v19.s[2], wzr
+; CHECK-GI-NEXT: movi d7, #0000000000000000
; CHECK-GI-NEXT: mov v22.s[3], w10
; CHECK-GI-NEXT: ldr w10, [sp, #128]
; CHECK-GI-NEXT: mov v24.s[2], w8
@@ -6855,7 +6855,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: sxtb w8, w8
; CHECK-GI-NEXT: fmov s26, w10
; CHECK-GI-NEXT: ldr w10, [sp, #144]
-; CHECK-GI-NEXT: mov v18.s[3], wzr
+; CHECK-GI-NEXT: movi d5, #0000000000000000
; CHECK-GI-NEXT: mov v25.s[2], w9
; CHECK-GI-NEXT: ldr w9, [sp, #120]
; CHECK-GI-NEXT: sxtb w12, w12
@@ -6872,14 +6872,14 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: ldr w9, [sp, #192]
; CHECK-GI-NEXT: add v22.4s, v22.4s, v23.4s
; CHECK-GI-NEXT: mov v27.s[1], w8
-; CHECK-GI-NEXT: mov v19.s[3], wzr
-; CHECK-GI-NEXT: fmov s5, wzr
+; CHECK-GI-NEXT: movi d16, #0000000000000000
+; CHECK-GI-NEXT: movi d17, #0000000000000000
; CHECK-GI-NEXT: mov v26.s[2], w10
; CHECK-GI-NEXT: ldr w10, [sp, #200]
; CHECK-GI-NEXT: sxtb w9, w9
-; CHECK-GI-NEXT: fmov s16, wzr
-; CHECK-GI-NEXT: fmov s17, wzr
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: movi d3, #0000000000000000
; CHECK-GI-NEXT: sxtb w8, w10
; CHECK-GI-NEXT: sxtb w10, w12
; CHECK-GI-NEXT: fmov s28, w9
@@ -6936,7 +6936,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: fmov s9, w12
; CHECK-GI-NEXT: sxtb w11, w11
; CHECK-GI-NEXT: sxtb w10, w10
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d2, #0000000000000000
; CHECK-GI-NEXT: sxtb w9, w9
; CHECK-GI-NEXT: mov v30.s[3], w8
; CHECK-GI-NEXT: ldr w8, [sp, #632]
@@ -6948,10 +6948,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: ldr w10, [sp, #688]
; CHECK-GI-NEXT: sxtb w11, w11
; CHECK-GI-NEXT: sxtb w8, w8
-; CHECK-GI-NEXT: fmov s3, wzr
+; CHECK-GI-NEXT: movi d4, #0000000000000000
; CHECK-GI-NEXT: sxtb w9, w9
; CHECK-GI-NEXT: sxtb w10, w10
-; CHECK-GI-NEXT: fmov s2, wzr
+; CHECK-GI-NEXT: mov v18.s[3], wzr
; CHECK-GI-NEXT: mov v9.s[2], w11
; CHECK-GI-NEXT: ldr w11, [sp, #664]
; CHECK-GI-NEXT: mov v10.s[1], w8
@@ -6963,7 +6963,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-GI-NEXT: ldr w10, [sp, #672]
; CHECK-GI-NEXT: sxtb w8, w8
; CHECK-GI-NEXT: sxtb w9, w9
-; CHECK-GI-NEXT: fmov s4, wzr
+; CHECK-GI-NEXT: mov v19.s[3], wzr
; CHECK-GI-NEXT: mov v11.s[1], w11
; CHECK-GI-NEXT: sxtb w10, w10
; CHECK-GI-NEXT: mov v20.s[1], wzr
@@ -7121,15 +7121,15 @@ define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_udot_v48i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q7, [x0, #32]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q17, [x1, #32]
; CHECK-GI-NEXT: ldp q4, q5, [x0]
-; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: udot v2.4s, v17.16b, v7.16b
; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b
; CHECK-GI-NEXT: udot v3.4s, v16.16b, v5.16b
@@ -7169,7 +7169,7 @@ define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
;
; CHECK-GI-LABEL: test_udot_v48i8_nomla:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: movi v1.16b, #1
; CHECK-GI-NEXT: ldr q7, [x0, #32]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
@@ -7212,15 +7212,15 @@ define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_sdot_v48i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q7, [x0, #32]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q17, [x1, #32]
; CHECK-GI-NEXT: ldp q4, q5, [x0]
-; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
; CHECK-GI-NEXT: sdot v2.4s, v17.16b, v7.16b
; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b
; CHECK-GI-NEXT: sdot v3.4s, v16.16b, v5.16b
@@ -7639,7 +7639,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48
; CHECK-GI-NEXT: fmov s2, w0
; CHECK-GI-NEXT: ldr w11, [sp, #208]
; CHECK-GI-NEXT: ldr w8, [sp, #216]
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: fmov s3, w10
; CHECK-GI-NEXT: ldr w10, [sp, #336]
; CHECK-GI-NEXT: ldr w12, [sp, #720]
@@ -7663,7 +7663,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48
; CHECK-GI-NEXT: ldr w11, [sp, #16]
; CHECK-GI-NEXT: mov v7.b[1], w9
; CHECK-GI-NEXT: ldr w9, [sp, #480]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: mov v6.b[1], w8
; CHECK-GI-NEXT: ldr w8, [sp, #96]
; CHECK-GI-NEXT: mov v4.b[2], w10
@@ -8271,7 +8271,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
; CHECK-GI-NEXT: fmov s2, w0
; CHECK-GI-NEXT: ldr w10, [sp, #216]
; CHECK-GI-NEXT: ldr w12, [sp, #848]
-; CHECK-GI-NEXT: fmov s1, wzr
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: fmov s4, w9
; CHECK-GI-NEXT: fmov s3, w11
; CHECK-GI-NEXT: ldr w11, [sp, #720]
@@ -8295,7 +8295,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
; CHECK-GI-NEXT: mov v2.b[2], w2
; CHECK-GI-NEXT: mov v3.b[2], w10
; CHECK-GI-NEXT: ldr w10, [sp, #864]
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: mov v7.b[1], w11
; CHECK-GI-NEXT: ldr w11, [sp, #992]
; CHECK-GI-NEXT: mov v4.b[2], w8
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 42b947604b860..1fa4b5f62bdec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1466,8 +1466,8 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 9b3da75be47ec..8f4a696a28d62 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -589,8 +589,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z2.s, z0.s[1]
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index d6d323530946e..25702ef25510c 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -851,15 +851,15 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
; CHECK-GI-NEXT: sub sp, sp, #16
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s0, wzr
+; CHECK-GI-NEXT: movi d0, #0000000000000000
; CHECK-GI-NEXT: fmov s2, w4
; CHECK-GI-NEXT: mov.s v1[1], w1
; CHECK-GI-NEXT: mov.s v2[1], w5
; CHECK-GI-NEXT: mov.s v0[1], wzr
; CHECK-GI-NEXT: mov.s v1[2], w2
; CHECK-GI-NEXT: cmeq.4s v0, v2, v0
-; CHECK-GI-NEXT: mvn.16b v0, v0
; CHECK-GI-NEXT: mov.s v1[3], w3
+; CHECK-GI-NEXT: mvn.16b v0, v0
; CHECK-GI-NEXT: cmtst.4s v1, v1, v1
; CHECK-GI-NEXT: mov.s w8, v1[1]
; CHECK-GI-NEXT: mov.s w9, v1[2]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index fb504028a161b..ee04e41d55046 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2127,15 +2127,15 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s0, wzr
+; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
-; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: udot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v16.16b, v5.16b
@@ -2395,15 +2395,15 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: fmov s0, wzr
+; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
-; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: sdot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v16.16b, v5.16b
More information about the llvm-commits mailing list