[llvm] [AArch64] Lower FPR register moves to zero cycle NEON (PR #153158)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 20 02:45:57 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Tomer Shafir (tomershafir)
<details>
<summary>Changes</summary>
[AArch64] Lower FPR register moves to zero cycle NEON
Lower FPR64, FPR32, FPR16, and FPR8 register moves into NEON moves if the target supports zero-cycle moves for NEON registers but not for the narrower register classes.
Adds a subtarget feature called FeatureZCRegMoveFPR128 that makes it possible to query whether the target supports zero-cycle register moves for FPR128 (NEON) registers, and adds it to the appropriate processors.
Includes lowering test cases, and specializes check prefixes.
---
Full diff: https://github.com/llvm/llvm-project/pull/153158.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64Features.td (+3)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+74-8)
- (modified) llvm/lib/Target/AArch64/AArch64Processors.td (+10)
- (modified) llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll (+104-45)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index c1c1f0a1024d0..55aea17d29f55 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -621,6 +621,9 @@ def FeatureZCRegMoveGPR64 : SubtargetFeature<"zcm-gpr64", "HasZeroCycleRegMoveGP
def FeatureZCRegMoveGPR32 : SubtargetFeature<"zcm-gpr32", "HasZeroCycleRegMoveGPR32", "true",
"Has zero-cycle register moves for GPR32 registers">;
+def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true",
+ "Has zero-cycle register moves for FPR128 registers">;
+
def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
"Has zero-cycle register moves for FPR64 registers">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d15f90deba74e..103e56a83a5de 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5318,15 +5318,49 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR64RegClass);
@@ -5348,8 +5382,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR64RegClass);
@@ -5375,8 +5425,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR64RegClass);
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 42eaeca906e66..b7e08dbe7c792 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -321,6 +321,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround]>;
@@ -334,6 +335,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -346,6 +348,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -358,6 +361,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -370,6 +374,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -387,6 +392,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -404,6 +410,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -421,6 +428,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -438,6 +446,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -454,6 +463,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
+ FeatureZCRegMoveFPR128,
FeatureZCZeroing
]>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
index fa15ab42c2638..a0f1b719372b3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
@@ -1,33 +1,84 @@
-; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
-; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
-; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOZCM-FPR128-CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOZCM-FPR128-CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ZCM-FPR128-CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines
+
+define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) {
+entry:
+; CHECK-LABEL: t:
+; NOZCM-FPR128-CPU: fmov d0, d2
+; NOZCM-FPR128-CPU: fmov d1, d3
+; NOZCM-FPR128-CPU: fmov [[REG2:d[0-9]+]], d3
+; NOZCM-FPR128-CPU: fmov [[REG1:d[0-9]+]], d2
+; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_double}}
+; NOZCM-FPR128-CPU: fmov d0, [[REG1]]
+; NOZCM-FPR128-CPU: fmov d1, [[REG2]]
+
+; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-CPU: mov.16b v0, v2
+; ZCM-FPR128-CPU: mov.16b v1, v3
+; ZCM-FPR128-CPU-NEXT: bl {{_?foo_double}}
+; ZCM-FPR128-CPU: mov.16b v0, [[REG1]]
+; ZCM-FPR128-CPU: mov.16b v1, [[REG2]]
+
+; NOZCM-FPR128-ATTR: fmov [[REG2:d[0-9]+]], d3
+; NOZCM-FPR128-ATTR: fmov [[REG1:d[0-9]+]], d2
+; NOZCM-FPR128-ATTR: fmov d0, d2
+; NOZCM-FPR128-ATTR: fmov d1, d3
+; NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_double}}
+; NOZCM-FPR128-ATTR: fmov d0, [[REG1]]
+; NOZCM-FPR128-ATTR: fmov d1, [[REG2]]
+
+; ZCM-FPR128-ATTR: mov.16b v0, v2
+; ZCM-FPR128-ATTR: mov.16b v1, v3
+; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_double}}
+; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]]
+; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]]
+ %call = call double @foo_double(double %c, double %d)
+ %call1 = call double @foo_double(double %c, double %d)
+ unreachable
+}
+
+declare float @foo_double(double, double)
define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
entry:
; CHECK-LABEL: t:
-; NOTCPU-LINUX: fmov s0, s2
-; NOTCPU-LINUX: fmov s1, s3
-; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
-; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
-; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
-; NOTCPU-LINUX: fmov s0, [[REG1]]
-; NOTCPU-LINUX: fmov s1, [[REG2]]
+; NOZCM-FPR128-CPU: fmov s0, s2
+; NOZCM-FPR128-CPU: fmov s1, s3
+; NOZCM-FPR128-CPU: fmov [[REG2:s[0-9]+]], s3
+; NOZCM-FPR128-CPU: fmov [[REG1:s[0-9]+]], s2
+; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_float}}
+; NOZCM-FPR128-CPU: fmov s0, [[REG1]]
+; NOZCM-FPR128-CPU: fmov s1, [[REG2]]
-; NOTCPU-APPLE: fmov s0, s2
-; NOTCPU-APPLE: fmov s1, s3
-; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
-; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
-; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
-; NOTCPU-APPLE: fmov s0, [[REG1]]
-; NOTCPU-APPLE: fmov s1, [[REG2]]
+; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-CPU: mov.16b v0, v2
+; ZCM-FPR128-CPU: mov.16b v1, v3
+; ZCM-FPR128-CPU-NEXT: bl {{_?foo_float}}
+; ZCM-FPR128-CPU: mov.16b v0, [[REG1]]
+; ZCM-FPR128-CPU: mov.16b v1, [[REG2]]
-; ATTR: fmov d0, d2
-; ATTR: fmov d1, d3
-; ATTR: fmov [[REG2:d[0-9]+]], d3
-; ATTR: fmov [[REG1:d[0-9]+]], d2
-; ATTR-NEXT: bl {{_?foo_float}}
-; ATTR: fmov d0, [[REG1]]
-; ATTR: fmov d1, [[REG2]]
+; NOZCM-FPR128-ATTR: fmov [[REG2:s[0-9]+]], s3
+; NOZCM-FPR128-ATTR: fmov [[REG1:s[0-9]+]], s2
+; NOZCM-FPR128-ATTR: fmov s0, s2
+; NOZCM-FPR128-ATTR: fmov s1, s3
+; NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_float}}
+; NOZCM-FPR128-ATTR: fmov s0, [[REG1]]
+; NOZCM-FPR128-ATTR: fmov s1, [[REG2]]
+
+; ZCM-FPR128-ATTR: mov.16b v0, v2
+; ZCM-FPR128-ATTR: mov.16b v1, v3
+; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_float}}
+; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]]
+; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]]
%call = call float @foo_float(float %c, float %d)
%call1 = call float @foo_float(float %c, float %d)
unreachable
@@ -38,29 +89,37 @@ declare float @foo_float(float, float)
define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
entry:
; CHECK-LABEL: t:
-; NOTCPU-LINUX: fmov s0, s2
-; NOTCPU-LINUX: fmov s1, s3
-; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
-; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
-; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
-; NOTCPU-LINUX: fmov s0, [[REG1]]
-; NOTCPU-LINUX: fmov s1, [[REG2]]
+; NOZCM-FPR128-CPU: fmov s0, s2
+; NOZCM-FPR128-CPU: fmov s1, s3
+; NOZCM-FPR128-CPU: fmov [[REG2:s[0-9]+]], s3
+; NOZCM-FPR128-CPU: fmov [[REG1:s[0-9]+]], s2
+; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_half}}
+; NOZCM-FPR128-CPU: fmov s0, [[REG1]]
+; NOZCM-FPR128-CPU: fmov s1, [[REG2]]
+
+; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-CPU: mov.16b v0, v2
+; ZCM-FPR128-CPU: mov.16b v1, v3
+; ZCM-FPR128-CPU-NEXT: bl {{_?foo_half}}
+; ZCM-FPR128-CPU: mov.16b v0, [[REG1]]
+; ZCM-FPR128-CPU: mov.16b v1, [[REG2]]
-; NOTCPU-APPLE: fmov s0, s2
-; NOTCPU-APPLE: fmov s1, s3
-; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
-; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
-; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
-; NOTCPU-APPLE: fmov s0, [[REG1]]
-; NOTCPU-APPLE: fmov s1, [[REG2]]
+; NOZCM-FPR128-ATTR: fmov [[REG2:s[0-9]+]], s3
+; NOZCM-FPR128-ATTR: fmov [[REG1:s[0-9]+]], s2
+; NOZCM-FPR128-ATTR: fmov s0, s2
+; NOZCM-FPR128-ATTR: fmov s1, s3
+; NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_half}}
+; NOZCM-FPR128-ATTR: fmov s0, [[REG1]]
+; NOZCM-FPR128-ATTR: fmov s1, [[REG2]]
-; ATTR: fmov d0, d2
-; ATTR: fmov d1, d3
-; ATTR: fmov [[REG2:d[0-9]+]], d3
-; ATTR: fmov [[REG1:d[0-9]+]], d2
-; ATTR-NEXT: bl {{_?foo_half}}
-; ATTR: fmov d0, [[REG1]]
-; ATTR: fmov d1, [[REG2]]
+; ZCM-FPR128-ATTR: mov.16b v0, v2
+; ZCM-FPR128-ATTR: mov.16b v1, v3
+; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3
+; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2
+; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_half}}
+; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]]
+; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]]
%call = call half @foo_half(half %c, half %d)
%call1 = call half @foo_half(half %c, half %d)
unreachable
``````````
</details>
https://github.com/llvm/llvm-project/pull/153158
More information about the llvm-commits
mailing list