[llvm] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 (PR #144152)
Tomer Shafir via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 14 01:58:10 PDT 2025
https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/144152
>From 32fa94f286f739843901300c72def692c494d0f8 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Fri, 13 Jun 2025 22:34:26 +0300
Subject: [PATCH] [AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8
This change emits optimized copy instructions for the FPR32, FPR16, and FPR8 register classes on targets that support zero-cycle register moves. The implementation is similar to what has been done for GPR32. It adds two regression tests, one for FPR32 and one for FPR16.
Depends on https://github.com/llvm/llvm-project/pull/143680, which resolves the test structure.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 89 +++++++++++++++----
.../AArch64/arm64-zero-cycle-regmov-fpr16.ll | 45 ++++++++++
.../AArch64/arm64-zero-cycle-regmov-fpr32.ll | 45 ++++++++++
3 files changed, 163 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..59cd913999717 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -58,6 +58,10 @@
#include <iterator>
#include <utility>
+#include "llvm/Support/Debug.h"
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "aarch64instrinfo2"
+
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
@@ -5302,30 +5306,83 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ LLVM_DEBUG(dbgs() << ">>>>>> FPR32\n");
+ LLVM_DEBUG(dbgs() << ">>>>>> Subtarget.isTargetDarwin(): "
+ << Subtarget.isTargetDarwin() << "\n");
+ LLVM_DEBUG(dbgs() << ">>>>>> Subtarget.hasZeroCycleRegMove(): "
+ << Subtarget.hasZeroCycleRegMove() << "\n");
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ LLVM_DEBUG(dbgs() << ">>>>>> FPR16\n");
+ LLVM_DEBUG(dbgs() << ">>>>>> Subtarget.isTargetDarwin(): "
+ << Subtarget.isTargetDarwin() << "\n");
+ LLVM_DEBUG(dbgs() << ">>>>>> Subtarget.hasZeroCycleRegMove(): "
+ << Subtarget.hasZeroCycleRegMove() << "\n");
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
new file mode 100644
index 0000000000000..879b66843be27
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU
+; RUN: llc < %s -march=arm64 -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU
+; RUN: llc < %s -march=arm64 -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR
+; RUN: llc < %s -march=arm64 -mattr=+zcm | FileCheck %s -check-prefixes=ATTR
+
+define half @t(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: mov s0, s2
+; NOTCPU: mov s1, s3
+; NOTCPU: mov [[REG2:s[0-9]+]], s3
+; NOTCPU: mov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: mov s0, [[REG1]]
+; NOTCPU: mov s1, [[REG2]]
+
+; CPU: mov [[REG2:d[0-9]+]], d3
+; CPU: mov [[REG1:d[0-9]+]], d2
+; CPU: mov d0, d2
+; CPU: mov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: mov d0, [[REG1]]
+; CPU: mov d1, [[REG2]]
+
+; NOTATTR: mov [[REG2:s[0-9]+]], s3
+; NOTATTR: mov [[REG1:s[0-9]+]], s2
+; NOTATTR: mov s0, s2
+; NOTATTR: mov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: mov s0, [[REG1]]
+; NOTATTR: mov s1, [[REG2]]
+
+; ATTR: mov d0, d2
+; ATTR: mov d1, d3
+; ATTR: mov [[REG2:d[0-9]+]], d3
+; ATTR: mov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: mov d0, [[REG1]]
+; ATTR: mov d1, [[REG2]]
+ %call = call half @foo(half %c, half %d)
+ %call1 = call half @foo(half %c, half %d)
+ unreachable
+}
+
+declare half @foo(half, half)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
new file mode 100644
index 0000000000000..d808e8fb7a385
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU
+; RUN: llc < %s -march=arm64 -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU
+; RUN: llc < %s -march=arm64 -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR
+; RUN: llc < %s -march=arm64 -mattr=+zcm | FileCheck %s -check-prefixes=ATTR
+
+define float @t(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: mov s0, s2
+; NOTCPU: mov s1, s3
+; NOTCPU: mov [[REG2:s[0-9]+]], s3
+; NOTCPU: mov [[REG1:s[0-9]+]], s2
+; NOTCPU: bl {{_?foo}}
+; NOTCPU: mov s0, [[REG1]]
+; NOTCPU: mov s1, [[REG2]]
+
+; CPU: mov [[REG2:d[0-9]+]], d3
+; CPU: mov [[REG1:d[0-9]+]], d2
+; CPU: mov d0, d2
+; CPU: mov d1, d3
+; CPU: bl {{_?foo}}
+; CPU: mov d0, [[REG1]]
+; CPU: mov d1, [[REG2]]
+
+; NOTATTR: mov [[REG2:s[0-9]+]], s3
+; NOTATTR: mov [[REG1:s[0-9]+]], s2
+; NOTATTR: mov s0, s2
+; NOTATTR: mov s1, s3
+; NOTATTR: bl {{_?foo}}
+; NOTATTR: mov s0, [[REG1]]
+; NOTATTR: mov s1, [[REG2]]
+
+; ATTR: mov d0, d2
+; ATTR: mov d1, d3
+; ATTR: mov [[REG2:d[0-9]+]], d3
+; ATTR: mov [[REG1:d[0-9]+]], d2
+; ATTR: bl {{_?foo}}
+; ATTR: mov d0, [[REG1]]
+; ATTR: mov d1, [[REG2]]
+ %call = call float @foo(float %c, float %d)
+ %call1 = call float @foo(float %c, float %d)
+ unreachable
+}
+
+declare float @foo(float, float)
More information about the llvm-commits
mailing list