[llvm] [SDAG] Support expanding `FSINCOS` to vector library calls (PR #114039)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 29 05:26:50 PDT 2024
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/114039
This shares most of its code with the scalar sincos expansion. It allows expanding vector FSINCOS nodes to a library call from the specified `-vector-library`. The upside of this is it will mean the vectorizer only needs to handle the sincos intrinsic, which has no memory effects, and this can handle lowering the intrinsic to a call that takes output pointers.
>From 915aef3b2d33584ddd0f40d1503116b50999524a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 29 Oct 2024 11:15:13 +0000
Subject: [PATCH] [SDAG] Support expanding `FSINCOS` to vector library calls
This shares most of its code with the scalar sincos expansion. It allows
expanding vector FSINCOS nodes to a library call from the specified
`-vector-library`. The upside of this is it will mean the vectorizer
only needs to handle the sincos intrinsic, which has no memory effects,
and this can handle lowering the intrinsic to a call that takes output
pointers.
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 3 +
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 71 +-----------
.../SelectionDAG/LegalizeVectorOps.cpp | 5 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 98 ++++++++++++++++
.../CodeGen/AArch64/veclib-llvm.sincos.ll | 109 ++++++++++++++++++
5 files changed, 216 insertions(+), 70 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index e82bdb6906163c..73c5df8ac47f55 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1599,6 +1599,9 @@ class SelectionDAG {
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
SDValue Op2);
+ /// Expand the specified \c ISD::FSINCOS node as the Legalize pass would.
+ bool expandFSINCOS(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 47a9ae12248ccb..b4846f32391918 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2348,75 +2348,6 @@ static bool useSinCos(SDNode *Node) {
return false;
}
-/// Issue libcalls to sincos to compute sin / cos pairs.
-void SelectionDAGLegalize::ExpandSinCosLibCall(
- SDNode *Node, SmallVectorImpl<SDValue> &Results) {
- EVT VT = Node->getValueType(0);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
- RTLIB::Libcall LC = RTLIB::getFSINCOS(VT);
-
- // Find users of the node that store the results (and share input chains). The
- // destination pointers can be used instead of creating stack allocations.
- SDValue StoresInChain{};
- std::array<StoreSDNode *, 2> ResultStores = {nullptr};
- for (SDNode *User : Node->uses()) {
- if (!ISD::isNormalStore(User))
- continue;
- auto *ST = cast<StoreSDNode>(User);
- if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
- ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) ||
- (StoresInChain && ST->getChain() != StoresInChain) ||
- Node->isPredecessorOf(ST->getChain().getNode()))
- continue;
- ResultStores[ST->getValue().getResNo()] = ST;
- StoresInChain = ST->getChain();
- }
-
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry{};
-
- // Pass the argument.
- Entry.Node = Node->getOperand(0);
- Entry.Ty = Ty;
- Args.push_back(Entry);
-
- // Pass the output pointers for sin and cos.
- SmallVector<SDValue, 2> ResultPtrs{};
- for (StoreSDNode *ST : ResultStores) {
- SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT);
- Entry.Node = ResultPtr;
- Entry.Ty = PointerType::getUnqual(Ty->getContext());
- Args.push_back(Entry);
- ResultPtrs.push_back(ResultPtr);
- }
-
- SDLoc DL(Node);
- SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode();
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
- TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
- std::move(Args));
-
- auto [Call, OutChain] = TLI.LowerCallTo(CLI);
-
- for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
- MachinePointerInfo PtrInfo;
- if (StoreSDNode *ST = ResultStores[ResNo]) {
- // Replace store with the library call.
- DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
- PtrInfo = ST->getPointerInfo();
- } else {
- PtrInfo = MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(),
- cast<FrameIndexSDNode>(ResultPtr)->getIndex());
- }
- SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
- Results.push_back(LoadResult);
- }
-}
-
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
@@ -4633,7 +4564,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
case ISD::FSINCOS:
// Expand into sincos libcall.
- ExpandSinCosLibCall(Node, Results);
+ (void)DAG.expandFSINCOS(Node, Results);
break;
case ISD::FLOG:
case ISD::STRICT_FLOG:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index c80da28b3dc34d..e31fcd7a57b7cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1191,6 +1191,11 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
RTLIB::REM_PPCF128, Results))
return;
+ break;
+ case ISD::FSINCOS:
+ if (DAG.expandFSINCOS(Node, Results))
+ return;
+
break;
case ISD::VECTOR_COMPRESS:
Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8f255cce1fe15d..1781abd0e6b035 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/BinaryFormat/Dwarf.h"
@@ -2483,6 +2484,103 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
return Subvectors[0];
}
+bool SelectionDAG::expandFSINCOS(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT VT = Node->getValueType(0);
+ LLVMContext *Ctx = getContext();
+ Type *Ty = VT.getTypeForEVT(*Ctx);
+ RTLIB::Libcall LC =
+ RTLIB::getFSINCOS(VT.isVector() ? VT.getVectorElementType() : VT);
+
+ const char *LCName = TLI->getLibcallName(LC);
+ if (!LC || !LCName)
+ return false;
+
+ auto getVecDesc = [&]() -> VecDesc const * {
+ for (bool Masked : {false, true}) {
+ if (VecDesc const *VD = getLibInfo().getVectorMappingInfo(
+ LCName, VT.getVectorElementCount(), Masked)) {
+ return VD;
+ }
+ }
+ return nullptr;
+ };
+
+ VecDesc const *VD = nullptr;
+ if (VT.isVector() && !(VD = getVecDesc()))
+ return false;
+
+ // Find users of the node that store the results (and share input chains). The
+ // destination pointers can be used instead of creating stack allocations.
+ SDValue StoresInChain{};
+ std::array<StoreSDNode *, 2> ResultStores = {nullptr};
+ for (SDNode *User : Node->uses()) {
+ if (!ISD::isNormalStore(User))
+ continue;
+ auto *ST = cast<StoreSDNode>(User);
+ if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
+ ST->getAlign() < getDataLayout().getABITypeAlign(Ty->getScalarType()) ||
+ (StoresInChain && ST->getChain() != StoresInChain) ||
+ Node->isPredecessorOf(ST->getChain().getNode()))
+ continue;
+ ResultStores[ST->getValue().getResNo()] = ST;
+ StoresInChain = ST->getChain();
+ }
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry{};
+
+ // Pass the argument.
+ Entry.Node = Node->getOperand(0);
+ Entry.Ty = Ty;
+ Args.push_back(Entry);
+
+ // Pass the output pointers for sin and cos.
+ SmallVector<SDValue, 2> ResultPtrs{};
+ for (StoreSDNode *ST : ResultStores) {
+ SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(VT);
+ Entry.Node = ResultPtr;
+ Entry.Ty = PointerType::getUnqual(Ty->getContext());
+ Args.push_back(Entry);
+ ResultPtrs.push_back(ResultPtr);
+ }
+
+ SDLoc DL(Node);
+
+ if (VD && VD->isMasked()) {
+ EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *Ctx, VT);
+ Entry.Node = getBoolConstant(true, DL, MaskVT, VT);
+ Entry.Ty = MaskVT.getTypeForEVT(*Ctx);
+ Args.push_back(Entry);
+ }
+
+ SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
+ SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
+ TLI->getPointerTy(getDataLayout()));
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*Ctx), Callee,
+ std::move(Args));
+
+ auto [Call, OutChain] = TLI->LowerCallTo(CLI);
+
+ for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
+ MachinePointerInfo PtrInfo;
+ if (StoreSDNode *ST = ResultStores[ResNo]) {
+ // Replace store with the library call.
+ ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+ PtrInfo = ST->getPointerInfo();
+ } else {
+ PtrInfo = MachinePointerInfo::getFixedStack(
+ getMachineFunction(), cast<FrameIndexSDNode>(ResultPtr)->getIndex());
+ }
+ SDValue LoadResult = getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
+ Results.push_back(LoadResult);
+ }
+
+ return true;
+}
+
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
SDLoc dl(Node);
const TargetLowering &TLI = getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
new file mode 100644
index 00000000000000..46ace2bf54f83b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
+
+define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_v4f32:
+; SLEEF: // %bb.0:
+; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT: .cfi_def_cfa_offset 16
+; SLEEF-NEXT: .cfi_offset w30, -16
+; SLEEF-NEXT: bl _ZGVnN4vl4l4_sincosf
+; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT: ret
+;
+; ARMPL-LABEL: test_sincos_v4f32:
+; ARMPL: // %bb.0:
+; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT: .cfi_def_cfa_offset 16
+; ARMPL-NEXT: .cfi_offset w30, -16
+; ARMPL-NEXT: bl armpl_vsincosq_f32
+; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT: ret
+ %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
+ %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
+ %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
+ store <4 x float> %result.0, ptr %out_sin, align 4
+ store <4 x float> %result.1, ptr %out_cos, align 4
+ ret void
+}
+
+define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_v2f64:
+; SLEEF: // %bb.0:
+; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT: .cfi_def_cfa_offset 16
+; SLEEF-NEXT: .cfi_offset w30, -16
+; SLEEF-NEXT: bl _ZGVnN2vl8l8_sincos
+; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT: ret
+;
+; ARMPL-LABEL: test_sincos_v2f64:
+; ARMPL: // %bb.0:
+; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT: .cfi_def_cfa_offset 16
+; ARMPL-NEXT: .cfi_offset w30, -16
+; ARMPL-NEXT: bl armpl_vsincosq_f64
+; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT: ret
+ %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
+ %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
+ %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
+ store <2 x double> %result.0, ptr %out_sin, align 8
+ store <2 x double> %result.1, ptr %out_cos, align 8
+ ret void
+}
+
+define void @test_sincos_nxv4f32(<vscale x 4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_nxv4f32:
+; SLEEF: // %bb.0:
+; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT: .cfi_def_cfa_offset 16
+; SLEEF-NEXT: .cfi_offset w30, -16
+; SLEEF-NEXT: bl _ZGVsNxvl4l4_sincosf
+; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT: ret
+;
+; ARMPL-LABEL: test_sincos_nxv4f32:
+; ARMPL: // %bb.0:
+; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT: .cfi_def_cfa_offset 16
+; ARMPL-NEXT: .cfi_offset w30, -16
+; ARMPL-NEXT: ptrue p0.s
+; ARMPL-NEXT: bl armpl_svsincos_f32_x
+; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT: ret
+ %result = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> %x)
+ %result.0 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 0
+ %result.1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 1
+ store <vscale x 4 x float> %result.0, ptr %out_sin, align 4
+ store <vscale x 4 x float> %result.1, ptr %out_cos, align 4
+ ret void
+}
+
+define void @test_sincos_nxv2f64(<vscale x 2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_nxv2f64:
+; SLEEF: // %bb.0:
+; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT: .cfi_def_cfa_offset 16
+; SLEEF-NEXT: .cfi_offset w30, -16
+; SLEEF-NEXT: bl _ZGVsNxvl8l8_sincos
+; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT: ret
+;
+; ARMPL-LABEL: test_sincos_nxv2f64:
+; ARMPL: // %bb.0:
+; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT: .cfi_def_cfa_offset 16
+; ARMPL-NEXT: .cfi_offset w30, -16
+; ARMPL-NEXT: ptrue p0.d
+; ARMPL-NEXT: bl armpl_svsincos_f64_x
+; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT: ret
+ %result = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> %x)
+ %result.0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 0
+ %result.1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 1
+ store <vscale x 2 x double> %result.0, ptr %out_sin, align 8
+ store <vscale x 2 x double> %result.1, ptr %out_cos, align 8
+ ret void
+}
More information about the llvm-commits
mailing list