[llvm] 928a7e6 - [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (#144152)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 09:40:23 PDT 2025
Author: Tomer Shafir
Date: 2025-06-26T19:40:20+03:00
New Revision: 928a7e6cb9333480f6eb883f93ca6560a696b0fc
URL: https://github.com/llvm/llvm-project/commit/928a7e6cb9333480f6eb883f93ca6560a696b0fc
DIFF: https://github.com/llvm/llvm-project/commit/928a7e6cb9333480f6eb883f93ca6560a696b0fc.diff
LOG: [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (#144152)
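This change emits optimized copy instructions for the FPR32, FPR16, and FPR8
register classes on targets that have zero-cycle register moves for FPR64 but
not for FPR32: the copy is widened to an FMOVDr on the matching D
super-registers. The implementation is similar to what has been done for
GPR32. It adds two regression tests, one for FPR32 and one for FPR16.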
Depends on: https://github.com/llvm/llvm-project/pull/143680 to resolve
the test structure.
Added:
llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
Modified:
llvm/lib/Target/AArch64/AArch64Features.td
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/lib/Target/AArch64/AArch64Processors.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 538a142118a89..24fbe207c4969 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -615,6 +615,12 @@ def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
+def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
+ "Has zero-cycle register moves for FPR64 registers">;
+
+def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
+ "Has zero-cycle register moves for FPR32 registers">;
+
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..c3837cfe73d28 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,78 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index ad041788e4d5d..4a5682475d107 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -312,6 +312,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround]>;
@@ -325,6 +326,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -337,6 +339,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -349,6 +352,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -361,6 +365,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -378,6 +383,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -395,6 +401,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -412,6 +419,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -429,6 +437,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -445,6 +454,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMove,
+ FeatureZCRegMoveFPR64,
FeatureZCZeroing
]>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
new file mode 100644
index 0000000000000..f422f96f33495
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_float}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_float}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_float}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call float @foo_float(float %c, float %d)
+ %call1 = call float @foo_float(float %c, float %d)
+ unreachable
+}
+
+declare float @foo_float(float, float)
+
+define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_half}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_half}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_half}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call half @foo_half(half %c, half %d)
+ %call1 = call half @foo_half(half %c, half %d)
+ unreachable
+}
+
+declare half @foo_half(half, half)
More information about the llvm-commits
mailing list