[llvm] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (PR #144152)
Tomer Shafir via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 22 06:52:45 PDT 2025
https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/144152
>From bb47589b1b21f0bd74ff14e47c9328114fb305e0 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Sun, 22 Jun 2025 16:41:37 +0300
Subject: [PATCH 1/2] [AArch64] Add 0-cycle copy subtarget features for FPR64,
FPR32 reg classes
This change adds 2 new subtarget features to AArch64 to model 0-cycle copy execution for FPR64 and FPR32 register classes. It also adds the new `FeatureZCRegMoveFPR64` feature (predicate `HasZeroCycleRegMoveFPR64`) to Apple processors.
---
llvm/lib/Target/AArch64/AArch64Features.td | 6 ++++++
llvm/lib/Target/AArch64/AArch64Processors.td | 10 ++++++++++
2 files changed, 16 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 469c76752c78c..bcc5d438d9afc 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -612,6 +612,12 @@ def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
+def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
+ "Has zero-cycle register moves for FPR64 registers">;
+
+def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
+ "Has zero-cycle register moves for FPR32 registers">;
+
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c7ea6393e2ad3..d8e4fcbcb5fa4 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -312,6 +312,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround]>;
@@ -325,6 +326,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -337,6 +339,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -349,6 +352,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -361,6 +365,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -378,6 +383,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -395,6 +401,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -412,6 +419,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -429,6 +437,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -445,6 +454,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing
]>;
>From 7c0a52625d177f496f3488254fc7370c72c664f3 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Sun, 22 Jun 2025 16:49:12 +0300
Subject: [PATCH 2/2] [AArch64] Use 0-cycle copy for FPR32, FPR16, FPR8
This change emits zero-cycle copy instructions for the FPR32, FPR16, and FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It also adds a regression test with two functions (FPR32 and FPR16), each checked under five RUN configurations.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 75 ++++++++++---
.../AArch64/arm64-zero-cycle-regmov-fpr.ll | 103 ++++++++++++++++++
2 files changed, 162 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..d893294b3e25d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,73 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() && !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() && !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() && !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
new file mode 100644
index 0000000000000..f422f96f33495
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: zero_cycle_regmov_FPR32:
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_float}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_float}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_float}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call float @foo_float(float %c, float %d)
+ %call1 = call float @foo_float(float %c, float %d)
+ unreachable
+}
+
+declare float @foo_float(float, float)
+
+define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: zero_cycle_regmov_FPR16:
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_half}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_half}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_half}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call half @foo_half(half %c, half %d)
+ %call1 = call half @foo_half(half %c, half %d)
+ unreachable
+}
+
+declare half @foo_half(half, half)
More information about the llvm-commits
mailing list