[llvm] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (PR #144152)
Tomer Shafir via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 16 10:03:41 PDT 2025
https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/144152
>From ad98f61374939253b290d352d1a2d48e40dc4536 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Fri, 13 Jun 2025 22:34:26 +0300
Subject: [PATCH] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8
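On Darwin targets that support zero-cycle register moves, this change emits register-to-register copies for the FPR32, FPR16, and FPR8 register classes as an FMOVDr on the containing D registers instead of an FMOVSr. The implementation is similar to what has been done for GPR32. The change adds two regression tests, one for FPR32 and one for FPR16.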
Depends on https://github.com/llvm/llvm-project/pull/143680, which resolves the test structure.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 75 +++++++++++++++----
.../AArch64/arm64-zero-cycle-regmov-fpr16.ll | 45 +++++++++++
.../AArch64/arm64-zero-cycle-regmov-fpr32.ll | 45 +++++++++++
3 files changed, 149 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..70d8e918acbfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,73 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
new file mode 100644
index 0000000000000..6e005c9ad9c67
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define half @t(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: fmov s0, s2
+; NOTCPU: fmov s1, s3
+; NOTCPU: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: fmov s0, [[REG1]]
+; NOTCPU: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call half @foo(half %c, half %d)
+ %call1 = call half @foo(half %c, half %d)
+ unreachable
+}
+
+declare half @foo(half, half)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
new file mode 100644
index 0000000000000..3726f6207d07e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define float @t(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: fmov s0, s2
+; NOTCPU: fmov s1, s3
+; NOTCPU: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: fmov s0, [[REG1]]
+; NOTCPU: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call float @foo(float %c, float %d)
+ %call1 = call float @foo(float %c, float %d)
+ unreachable
+}
+
+declare float @foo(float, float)
More information about the llvm-commits mailing list