[llvm] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (PR #144152)
Tomer Shafir via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 14 06:53:28 PDT 2025
https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/144152
>From 0e8e6acdd83fb0edf9f00886133f4c5b8457cf3f Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Fri, 13 Jun 2025 22:34:26 +0300
Subject: [PATCH] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8
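This change emits optimized copy instructions for the FPR32, FPR16, and FPR8 register classes on targets that support zero-cycle register moves: the copy is emitted as an FMOVDr of the enclosing 64-bit D registers instead of an FMOVSr. The implementation is similar to what has been done for GPR32. It adds two regression tests, one for FPR32 and one for FPR16.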
Depends on https://github.com/llvm/llvm-project/pull/143680, which needs to land first to resolve the test structure.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 75 +++++++++++++++----
.../AArch64/arm64-zero-cycle-regmov-fpr16.ll | 45 +++++++++++
.../AArch64/arm64-zero-cycle-regmov-fpr32.ll | 45 +++++++++++
3 files changed, 149 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..70d8e918acbfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,73 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
new file mode 100644
index 0000000000000..f168b90adde5f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR
+
+define half @t(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: mov s0, s2
+; NOTCPU: mov s1, s3
+; NOTCPU: mov [[REG2:s[0-9]+]], s3
+; NOTCPU: mov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: mov s0, [[REG1]]
+; NOTCPU: mov s1, [[REG2]]
+
+; CPU: mov [[REG2:d[0-9]+]], d3
+; CPU: mov [[REG1:d[0-9]+]], d2
+; CPU: mov d0, d2
+; CPU: mov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: mov d0, [[REG1]]
+; CPU: mov d1, [[REG2]]
+
+; NOTATTR: mov [[REG2:s[0-9]+]], s3
+; NOTATTR: mov [[REG1:s[0-9]+]], s2
+; NOTATTR: mov s0, s2
+; NOTATTR: mov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: mov s0, [[REG1]]
+; NOTATTR: mov s1, [[REG2]]
+
+; ATTR: mov d0, d2
+; ATTR: mov d1, d3
+; ATTR: mov [[REG2:d[0-9]+]], d3
+; ATTR: mov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: mov d0, [[REG1]]
+; ATTR: mov d1, [[REG2]]
+ %call = call half @foo(half %c, half %d)
+ %call1 = call half @foo(half %c, half %d)
+ unreachable
+}
+
+declare half @foo(half, half)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
new file mode 100644
index 0000000000000..e5a2d17362a07
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR
+
+define float @t(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: mov s0, s2
+; NOTCPU: mov s1, s3
+; NOTCPU: mov [[REG2:s[0-9]+]], s3
+; NOTCPU: mov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: mov s0, [[REG1]]
+; NOTCPU: mov s1, [[REG2]]
+
+; CPU: mov [[REG2:d[0-9]+]], d3
+; CPU: mov [[REG1:d[0-9]+]], d2
+; CPU: mov d0, d2
+; CPU: mov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: mov d0, [[REG1]]
+; CPU: mov d1, [[REG2]]
+
+; NOTATTR: mov [[REG2:s[0-9]+]], s3
+; NOTATTR: mov [[REG1:s[0-9]+]], s2
+; NOTATTR: mov s0, s2
+; NOTATTR: mov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: mov s0, [[REG1]]
+; NOTATTR: mov s1, [[REG2]]
+
+; ATTR: mov d0, d2
+; ATTR: mov d1, d3
+; ATTR: mov [[REG2:d[0-9]+]], d3
+; ATTR: mov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: mov d0, [[REG1]]
+; ATTR: mov d1, [[REG2]]
+ %call = call float @foo(float %c, float %d)
+ %call1 = call float @foo(float %c, float %d)
+ unreachable
+}
+
+declare float @foo(float, float)
More information about the llvm-commits mailing list