[llvm] [AArch64] Avoid using NEON FCVTXN in Streaming-SVE mode. (PR #91981)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Thu May 16 06:24:04 PDT 2024
https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/91981
>From 189aad6e57aa35112994759ec494b54838f046c5 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 26 Apr 2024 13:57:41 +0100
Subject: [PATCH 1/2] [AArch64] Avoid using NEON FCVTXN in Streaming-SVE mode.
We can still lower these operations using (streaming-compatible) SVE
instructions when compiling for SME or SVE2.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 28 ++++++++++++---
...e-streaming-mode-fixed-length-fcopysign.ll | 36 +++++++++++--------
2 files changed, 45 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7344387ffe552..522f2dc95f87b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19507,7 +19507,27 @@ static SDValue performBuildVectorCombine(SDNode *N,
SDLoc DL(N);
EVT VT = N->getValueType(0);
- if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ bool CanUseFCVTXN = Subtarget.isNeonAvailable() ||
+ (Subtarget.useSVEForFixedLengthVectors() &&
+ (Subtarget.hasSVE2() || Subtarget.hasSME()));
+ if (CanUseFCVTXN && (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
+ // Convenience function to build an FCVT instruction, which is needed
+ // once for the bottom bits and once for the top bits.
+ auto MakeFCVTXN = [&](SDValue V) {
+ if (Subtarget.isNeonAvailable())
+ return DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, V);
+ else {
+ SDValue In = convertToScalableVector(DAG, MVT::nxv2f64, V);
+ SDValue PTrue = getPredicateForVector(DAG, DL, MVT::v2f64);
+ SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_fcvtx_f32f64,
+ DL, MVT::i64);
+ SDValue Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv4f32,
+ {ID, In, PTrue, In});
+ return convertFromScalableVector(DAG, MVT::v2f32, Op);
+ }
+ };
+
SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
if (Elt0->getOpcode() == ISD::FP_ROUND &&
@@ -19548,12 +19568,10 @@ static SDValue performBuildVectorCombine(SDNode *N,
Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
- HighLanes =
- DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
+ HighLanes = MakeFCVTXN(HighLanesSrcVec);
}
if (HighLanes) {
- SDValue DoubleToSingleSticky =
- DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
+ SDValue DoubleToSingleSticky = MakeFCVTXN(LowLanesSrcVec);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
DoubleToSingleSticky, HighLanes);
return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 0d6675def8b52..196105d3f26d0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -427,28 +427,36 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
; SVE-LABEL: test_copysign_v4f16_v4f64:
; SVE: // %bb.0:
-; SVE-NEXT: ldp q0, q1, [x1]
-; SVE-NEXT: ptrue p0.s, vl2
-; SVE-NEXT: fcvtxn v1.2s, v1.2d
-; SVE-NEXT: fcvtxn v0.2s, v0.2d
-; SVE-NEXT: splice z0.s, p0, z0.s, z1.s
-; SVE-NEXT: ptrue p0.s
-; SVE-NEXT: ldr d1, [x0]
-; SVE-NEXT: and z1.h, z1.h, #0x7fff
-; SVE-NEXT: fcvt z0.h, p0/m, z0.s
-; SVE-NEXT: uzp1 z0.h, z0.h, z0.h
+; SVE-NEXT: sub sp, sp, #16
+; SVE-NEXT: .cfi_def_cfa_offset 16
+; SVE-NEXT: ldp q1, q0, [x1]
+; SVE-NEXT: ldr d4, [x0]
+; SVE-NEXT: and z4.h, z4.h, #0x7fff
+; SVE-NEXT: mov z2.d, z0.d[1]
+; SVE-NEXT: mov z3.d, z1.d[1]
+; SVE-NEXT: fcvt h0, d0
+; SVE-NEXT: fcvt h1, d1
+; SVE-NEXT: fcvt h2, d2
+; SVE-NEXT: fcvt h3, d3
+; SVE-NEXT: str h0, [sp, #12]
+; SVE-NEXT: str h1, [sp, #8]
+; SVE-NEXT: str h2, [sp, #14]
+; SVE-NEXT: str h3, [sp, #10]
+; SVE-NEXT: ldr d0, [sp, #8]
; SVE-NEXT: and z0.h, z0.h, #0x8000
-; SVE-NEXT: orr z0.d, z1.d, z0.d
+; SVE-NEXT: orr z0.d, z4.d, z0.d
; SVE-NEXT: str d0, [x0]
+; SVE-NEXT: add sp, sp, #16
; SVE-NEXT: ret
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
; SVE2-NEXT: ldp q0, q1, [x1]
-; SVE2-NEXT: ptrue p0.s, vl2
+; SVE2-NEXT: ptrue p0.d, vl2
; SVE2-NEXT: ldr d2, [x0]
-; SVE2-NEXT: fcvtxn v1.2s, v1.2d
-; SVE2-NEXT: fcvtxn v0.2s, v0.2d
+; SVE2-NEXT: fcvtx z1.s, p0/m, z1.d
+; SVE2-NEXT: fcvtx z0.s, p0/m, z0.d
+; SVE2-NEXT: ptrue p0.s, vl2
; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
; SVE2-NEXT: ptrue p0.s
; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
>From d739b3b2d0c5bf6486a9a3ed27300542e9151bb0 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Thu, 16 May 2024 14:16:00 +0100
Subject: [PATCH 2/2] Don't emulate FCVTXN with SVE
@paulwalker-arm asked for the code to be simplified for now, since the SVE code is
already not great and this is something we'll want to change separately.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 29 ++++-------------
...e-streaming-mode-fixed-length-fcopysign.ll | 32 +++++++++++--------
2 files changed, 25 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 522f2dc95f87b..7b12c6ea2c1ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19507,27 +19507,8 @@ static SDValue performBuildVectorCombine(SDNode *N,
SDLoc DL(N);
EVT VT = N->getValueType(0);
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- bool CanUseFCVTXN = Subtarget.isNeonAvailable() ||
- (Subtarget.useSVEForFixedLengthVectors() &&
- (Subtarget.hasSVE2() || Subtarget.hasSME()));
- if (CanUseFCVTXN && (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
- // Convenience function to build an FCVT instruction, which is needed
- // once for the bottom bits and once for the top bits.
- auto MakeFCVTXN = [&](SDValue V) {
- if (Subtarget.isNeonAvailable())
- return DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, V);
- else {
- SDValue In = convertToScalableVector(DAG, MVT::nxv2f64, V);
- SDValue PTrue = getPredicateForVector(DAG, DL, MVT::v2f64);
- SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_fcvtx_f32f64,
- DL, MVT::i64);
- SDValue Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv4f32,
- {ID, In, PTrue, In});
- return convertFromScalableVector(DAG, MVT::v2f32, Op);
- }
- };
-
+ if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+ (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
if (Elt0->getOpcode() == ISD::FP_ROUND &&
@@ -19568,10 +19549,12 @@ static SDValue performBuildVectorCombine(SDNode *N,
Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
- HighLanes = MakeFCVTXN(HighLanesSrcVec);
+ HighLanes =
+ DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
}
if (HighLanes) {
- SDValue DoubleToSingleSticky = MakeFCVTXN(LowLanesSrcVec);
+ SDValue DoubleToSingleSticky =
+ DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
DoubleToSingleSticky, HighLanes);
return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 196105d3f26d0..f017eead92cff 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -451,19 +451,25 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
-; SVE2-NEXT: ldp q0, q1, [x1]
-; SVE2-NEXT: ptrue p0.d, vl2
-; SVE2-NEXT: ldr d2, [x0]
-; SVE2-NEXT: fcvtx z1.s, p0/m, z1.d
-; SVE2-NEXT: fcvtx z0.s, p0/m, z0.d
-; SVE2-NEXT: ptrue p0.s, vl2
-; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
-; SVE2-NEXT: ptrue p0.s
-; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
-; SVE2-NEXT: fcvt z0.h, p0/m, z0.s
-; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
-; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
-; SVE2-NEXT: str d2, [x0]
+; SVE2-NEXT: sub sp, sp, #16
+; SVE2-NEXT: .cfi_def_cfa_offset 16
+; SVE2-NEXT: ldp q2, q1, [x1]
+; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
+; SVE2-NEXT: ldr d5, [x0]
+; SVE2-NEXT: mov z3.d, z1.d[1]
+; SVE2-NEXT: mov z4.d, z2.d[1]
+; SVE2-NEXT: fcvt h1, d1
+; SVE2-NEXT: fcvt h2, d2
+; SVE2-NEXT: fcvt h3, d3
+; SVE2-NEXT: fcvt h4, d4
+; SVE2-NEXT: str h1, [sp, #12]
+; SVE2-NEXT: str h2, [sp, #8]
+; SVE2-NEXT: str h3, [sp, #14]
+; SVE2-NEXT: str h4, [sp, #10]
+; SVE2-NEXT: ldr d1, [sp, #8]
+; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d
+; SVE2-NEXT: str d5, [x0]
+; SVE2-NEXT: add sp, sp, #16
; SVE2-NEXT: ret
%a = load <4 x half>, ptr %ap
%b = load <4 x double>, ptr %bp
More information about the llvm-commits
mailing list