[llvm] [AArch64] Lower zero cycle FPR zeroing (PR #156261)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 31 12:47:02 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Tomer Shafir (tomershafir)
<details>
<summary>Changes</summary>
Lower FPR64, FPR32, FPR16 from `fmov` zeroing into NEON zeroing if the target supports zero cycle zeroing of NEON registers but not for the narrower classes.
It handles 2 cases: one in `AsmPrinter`, where an FP zeroing from an immediate has been captured by pattern matching during instruction selection, and a second one post-RA in `AArch64InstrInfo::copyPhysReg` for uncaptured/later-generated WZR/XZR fmovs.
Adds a subtarget feature called FeatureZCZeroingFPR128 that enables querying whether the target supports zero cycle zeroing for FPR128 NEON registers, and updates the appropriate processors.
---
Patch is 51.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156261.diff
13 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp (+69-36)
- (modified) llvm/lib/Target/AArch64/AArch64Features.td (+3)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+39-4)
- (modified) llvm/lib/Target/AArch64/AArch64Processors.td (+29-9)
- (modified) llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll (+31-17)
- (modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/expand-select.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/ext-narrow-index.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/fsh.ll (+15-15)
- (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+76-76)
- (modified) llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index da344305f39d9..b1f411d489ebd 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {
/// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI);
+ void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
- if (STI->hasZeroCycleZeroingFPR64() &&
- !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
- // Convert H/S register to corresponding D register
- if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
- DestReg = AArch64::D0 + (DestReg - AArch64::H0);
- else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
- DestReg = AArch64::D0 + (DestReg - AArch64::S0);
- else
- assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
+ if (STI->hasZeroCycleZeroingFPR64()) {
+ // Convert H/S register to corresponding D register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ else if (AArch64::FPR32RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ else
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVID);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else if (STI->hasZeroCycleZeroingFPR128()) {
+ // Convert H/S/D register to corresponding Q register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ } else {
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ }
- MCInst MOVI;
- MOVI.setOpcode(AArch64::MOVID);
- MOVI.addOperand(MCOperand::createReg(DestReg));
- MOVI.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MOVI);
- } else {
- MCInst FMov;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case AArch64::FMOVH0:
- FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
- if (!STI->hasFullFP16())
- DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVS0:
- FMov.setOpcode(AArch64::FMOVWSr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVD0:
- FMov.setOpcode(AArch64::FMOVXDr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::XZR));
- break;
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
}
- EmitToStreamer(*OutStreamer, FMov);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
+ }
+}
+
+void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
+ Register DestReg) {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVH0:
+ FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
+ if (!STI->hasFullFP16())
+ DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
}
+ EmitToStreamer(*OutStreamer, FMov);
}
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 6904e09072649..46f5f0c1ca9dd 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
"Has zero-cycle zeroing instructions for GPR32 registers">;
+def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
+ "Has zero-cycle zeroing instructions for FPR128 registers">;
+
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this not true for
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 3ce7829207cb6..b5c2f73760bf1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5469,8 +5469,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::XZR == SrcReg &&
+ !Subtarget.hasZeroCycleZeroingFPWorkaround() &&
+ Subtarget.isNeonAvailable()) {
+ if (Subtarget.hasZeroCycleZeroingFPR64()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVID), DestReg).addImm(0);
+ } else if (Subtarget.hasZeroCycleZeroingFPR128()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(
+ DestReg, AArch64::dsub, &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5482,8 +5498,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::WZR == SrcReg &&
+ !Subtarget.hasZeroCycleZeroingFPWorkaround() &&
+ Subtarget.isNeonAvailable()) {
+ if (Subtarget.hasZeroCycleZeroingFPR64()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVID), DestRegD).addImm(0);
+ } else if (Subtarget.hasZeroCycleZeroingFPR128()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(
+ DestReg, AArch64::ssub, &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index d5f4e91ae5188..81f5d075729d9 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128,
FeatureZCZeroingFPWorkaround]>;
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
@@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
@@ -386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
@@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
@@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 7934e39b2b69f..78e20f2a5e214 100644
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
; CHECK-NEXT: add d0, d1, d0
-; CHECK-NEXT: fmov d1, xzr
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: sub d0, d1, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
; GENERIC-NEXT: add d0, d1, d0
-; GENERIC-NEXT: fmov d1, xzr
+; GENERIC-NEXT: movi d1, #0000000000000000
; GENERIC-NEXT: sub d0, d1, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
index 2a75976d58549..ccdaa8779e38f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+zcz-fpr128 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
-; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
@@ -12,9 +13,10 @@ define half @tf16() {
entry:
; ALL-LABEL: tf16:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
-; NOZCZ-FPR64-FULLFP16: mov h0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16: mov h0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret half 0.0
}
@@ -22,8 +24,9 @@ define float @tf32() {
entry:
; ALL-LABEL: tf32:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret float 0.0
}
@@ -31,8 +34,9 @@ define double @td64() {
entry:
; ALL-LABEL: td64:
; FP-WORKAROUND: mov d0, xzr
-; NOZCZ-FPR64: mov d0, xzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov d0, xzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret double 0.0
}
@@ -40,8 +44,9 @@ define <8 x i8> @tv8i8() {
entry:
; ALL-LABEL: tv8i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
@@ -49,8 +54,9 @@ define <4 x i16> @tv4i16() {
entry:
; ALL-LABEL: tv4i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
}
@@ -58,8 +64,9 @...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/156261
More information about the llvm-commits
mailing list