[llvm] [SDAG] Support expanding `FSINCOS` to vector library calls (PR #114039)

Tue Oct 29 05:27:24 PDT 2024

llvmbot wrote:



@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

<details>
<summary>Changes</summary>

This shares most of its code with the scalar sincos expansion. It allows expanding vector FSINCOS nodes to a library call from the specified `-vector-library`. The upside of this is it will mean the vectorizer only needs to handle the sincos intrinsic, which has no memory effects, and this can handle lowering the intrinsic to a call that takes output pointers.

---
Full diff: https://github.com/llvm/llvm-project/pull/114039.diff


5 Files Affected:

- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+3) 
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (+1-70) 
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (+5) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+98) 
- (added) llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll (+109) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index e82bdb6906163c..73c5df8ac47f55 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1599,6 +1599,9 @@ class SelectionDAG {
   SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
                               SDValue Op2);
 
+  /// Expand the specified \c ISD::FSINCOS node as the Legalize pass would.
+  bool expandFSINCOS(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
   SDValue expandVAArg(SDNode *Node);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 47a9ae12248ccb..b4846f32391918 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2348,75 +2348,6 @@ static bool useSinCos(SDNode *Node) {
   return false;
 }
 
-/// Issue libcalls to sincos to compute sin / cos pairs.
-void SelectionDAGLegalize::ExpandSinCosLibCall(
-    SDNode *Node, SmallVectorImpl<SDValue> &Results) {
-  EVT VT = Node->getValueType(0);
-  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-  RTLIB::Libcall LC = RTLIB::getFSINCOS(VT);
-
-  // Find users of the node that store the results (and share input chains). The
-  // destination pointers can be used instead of creating stack allocations.
-  SDValue StoresInChain{};
-  std::array<StoreSDNode *, 2> ResultStores = {nullptr};
-  for (SDNode *User : Node->uses()) {
-    if (!ISD::isNormalStore(User))
-      continue;
-    auto *ST = cast<StoreSDNode>(User);
-    if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
-        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) ||
-        (StoresInChain && ST->getChain() != StoresInChain) ||
-        Node->isPredecessorOf(ST->getChain().getNode()))
-      continue;
-    ResultStores[ST->getValue().getResNo()] = ST;
-    StoresInChain = ST->getChain();
-  }
-
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry{};
-
-  // Pass the argument.
-  Entry.Node = Node->getOperand(0);
-  Entry.Ty = Ty;
-  Args.push_back(Entry);
-
-  // Pass the output pointers for sin and cos.
-  SmallVector<SDValue, 2> ResultPtrs{};
-  for (StoreSDNode *ST : ResultStores) {
-    SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT);
-    Entry.Node = ResultPtr;
-    Entry.Ty = PointerType::getUnqual(Ty->getContext());
-    Args.push_back(Entry);
-    ResultPtrs.push_back(ResultPtr);
-  }
-
-  SDLoc DL(Node);
-  SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode();
-  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
-                                         TLI.getPointerTy(DAG.getDataLayout()));
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
-      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
-      std::move(Args));
-
-  auto [Call, OutChain] = TLI.LowerCallTo(CLI);
-
-  for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
-    MachinePointerInfo PtrInfo;
-    if (StoreSDNode *ST = ResultStores[ResNo]) {
-      // Replace store with the library call.
-      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
-      PtrInfo = ST->getPointerInfo();
-    } else {
-      PtrInfo = MachinePointerInfo::getFixedStack(
-          DAG.getMachineFunction(),
-          cast<FrameIndexSDNode>(ResultPtr)->getIndex());
-    }
-    SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
-    Results.push_back(LoadResult);
-  }
-}
-
 SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
   SDLoc dl(Node);
   EVT VT = Node->getValueType(0);
@@ -4633,7 +4564,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   case ISD::FSINCOS:
     // Expand into sincos libcall.
-    ExpandSinCosLibCall(Node, Results);
+    (void)DAG.expandFSINCOS(Node, Results);
     break;
   case ISD::FLOG:
   case ISD::STRICT_FLOG:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index c80da28b3dc34d..e31fcd7a57b7cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1191,6 +1191,11 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
                              RTLIB::REM_PPCF128, Results))
       return;
 
+    break;
+  case ISD::FSINCOS:
+    if (DAG.expandFSINCOS(Node, Results))
+      return;
+
     break;
   case ISD::VECTOR_COMPRESS:
     Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8f255cce1fe15d..1781abd0e6b035 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -2483,6 +2484,103 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
   return Subvectors[0];
 }
 
+bool SelectionDAG::expandFSINCOS(SDNode *Node,
+                                 SmallVectorImpl<SDValue> &Results) {
+  EVT VT = Node->getValueType(0);
+  LLVMContext *Ctx = getContext();
+  Type *Ty = VT.getTypeForEVT(*Ctx);
+  RTLIB::Libcall LC =
+      RTLIB::getFSINCOS(VT.isVector() ? VT.getVectorElementType() : VT);
+
+  const char *LCName = TLI->getLibcallName(LC);
+  if (!LC || !LCName)
+    return false;
+
+  auto getVecDesc = [&]() -> VecDesc const * {
+    for (bool Masked : {false, true}) {
+      if (VecDesc const *VD = getLibInfo().getVectorMappingInfo(
+              LCName, VT.getVectorElementCount(), Masked)) {
+        return VD;
+      }
+    }
+    return nullptr;
+  };
+
+  VecDesc const *VD = nullptr;
+  if (VT.isVector() && !(VD = getVecDesc()))
+    return false;
+
+  // Find users of the node that store the results (and share input chains). The
+  // destination pointers can be used instead of creating stack allocations.
+  SDValue StoresInChain{};
+  std::array<StoreSDNode *, 2> ResultStores = {nullptr};
+  for (SDNode *User : Node->uses()) {
+    if (!ISD::isNormalStore(User))
+      continue;
+    auto *ST = cast<StoreSDNode>(User);
+    if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
+        ST->getAlign() < getDataLayout().getABITypeAlign(Ty->getScalarType()) ||
+        (StoresInChain && ST->getChain() != StoresInChain) ||
+        Node->isPredecessorOf(ST->getChain().getNode()))
+      continue;
+    ResultStores[ST->getValue().getResNo()] = ST;
+    StoresInChain = ST->getChain();
+  }
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry{};
+
+  // Pass the argument.
+  Entry.Node = Node->getOperand(0);
+  Entry.Ty = Ty;
+  Args.push_back(Entry);
+
+  // Pass the output pointers for sin and cos.
+  SmallVector<SDValue, 2> ResultPtrs{};
+  for (StoreSDNode *ST : ResultStores) {
+    SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(VT);
+    Entry.Node = ResultPtr;
+    Entry.Ty = PointerType::getUnqual(Ty->getContext());
+    Args.push_back(Entry);
+    ResultPtrs.push_back(ResultPtr);
+  }
+
+  SDLoc DL(Node);
+
+  if (VD && VD->isMasked()) {
+    EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *Ctx, VT);
+    Entry.Node = getBoolConstant(true, DL, MaskVT, VT);
+    Entry.Ty = MaskVT.getTypeForEVT(*Ctx);
+    Args.push_back(Entry);
+  }
+
+  SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
+  SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
+                                     TLI->getPointerTy(getDataLayout()));
+  TargetLowering::CallLoweringInfo CLI(*this);
+  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
+      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*Ctx), Callee,
+      std::move(Args));
+
+  auto [Call, OutChain] = TLI->LowerCallTo(CLI);
+
+  for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
+    MachinePointerInfo PtrInfo;
+    if (StoreSDNode *ST = ResultStores[ResNo]) {
+      // Replace store with the library call.
+      ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+      PtrInfo = ST->getPointerInfo();
+    } else {
+      PtrInfo = MachinePointerInfo::getFixedStack(
+          getMachineFunction(), cast<FrameIndexSDNode>(ResultPtr)->getIndex());
+    }
+    SDValue LoadResult = getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
+    Results.push_back(LoadResult);
+  }
+
+  return true;
+}
+
 SDValue SelectionDAG::expandVAArg(SDNode *Node) {
   SDLoc dl(Node);
   const TargetLowering &TLI = getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
new file mode 100644
index 00000000000000..46ace2bf54f83b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
+
+define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_v4f32:
+; SLEEF:       // %bb.0:
+; SLEEF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT:    .cfi_def_cfa_offset 16
+; SLEEF-NEXT:    .cfi_offset w30, -16
+; SLEEF-NEXT:    bl _ZGVnN4vl4l4_sincosf
+; SLEEF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT:    ret
+;
+; ARMPL-LABEL: test_sincos_v4f32:
+; ARMPL:       // %bb.0:
+; ARMPL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT:    .cfi_def_cfa_offset 16
+; ARMPL-NEXT:    .cfi_offset w30, -16
+; ARMPL-NEXT:    bl armpl_vsincosq_f32
+; ARMPL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT:    ret
+  %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
+  %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
+  %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
+  store <4 x float> %result.0, ptr %out_sin, align 4
+  store <4 x float> %result.1, ptr %out_cos, align 4
+  ret void
+}
+
+define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_v2f64:
+; SLEEF:       // %bb.0:
+; SLEEF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT:    .cfi_def_cfa_offset 16
+; SLEEF-NEXT:    .cfi_offset w30, -16
+; SLEEF-NEXT:    bl _ZGVnN2vl8l8_sincos
+; SLEEF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT:    ret
+;
+; ARMPL-LABEL: test_sincos_v2f64:
+; ARMPL:       // %bb.0:
+; ARMPL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT:    .cfi_def_cfa_offset 16
+; ARMPL-NEXT:    .cfi_offset w30, -16
+; ARMPL-NEXT:    bl armpl_vsincosq_f64
+; ARMPL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT:    ret
+  %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
+  %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
+  %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
+  store <2 x double> %result.0, ptr %out_sin, align 8
+  store <2 x double> %result.1, ptr %out_cos, align 8
+  ret void
+}
+
+define void @test_sincos_nxv4f32(<vscale x 4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_nxv4f32:
+; SLEEF:       // %bb.0:
+; SLEEF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT:    .cfi_def_cfa_offset 16
+; SLEEF-NEXT:    .cfi_offset w30, -16
+; SLEEF-NEXT:    bl _ZGVsNxvl4l4_sincosf
+; SLEEF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT:    ret
+;
+; ARMPL-LABEL: test_sincos_nxv4f32:
+; ARMPL:       // %bb.0:
+; ARMPL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT:    .cfi_def_cfa_offset 16
+; ARMPL-NEXT:    .cfi_offset w30, -16
+; ARMPL-NEXT:    ptrue p0.s
+; ARMPL-NEXT:    bl armpl_svsincos_f32_x
+; ARMPL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT:    ret
+  %result = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> %x)
+  %result.0 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 0
+  %result.1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 1
+  store <vscale x 4 x float> %result.0, ptr %out_sin, align 4
+  store <vscale x 4 x float> %result.1, ptr %out_cos, align 4
+  ret void
+}
+
+define void @test_sincos_nxv2f64(<vscale x 2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincos_nxv2f64:
+; SLEEF:       // %bb.0:
+; SLEEF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SLEEF-NEXT:    .cfi_def_cfa_offset 16
+; SLEEF-NEXT:    .cfi_offset w30, -16
+; SLEEF-NEXT:    bl _ZGVsNxvl8l8_sincos
+; SLEEF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SLEEF-NEXT:    ret
+;
+; ARMPL-LABEL: test_sincos_nxv2f64:
+; ARMPL:       // %bb.0:
+; ARMPL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; ARMPL-NEXT:    .cfi_def_cfa_offset 16
+; ARMPL-NEXT:    .cfi_offset w30, -16
+; ARMPL-NEXT:    ptrue p0.d
+; ARMPL-NEXT:    bl armpl_svsincos_f64_x
+; ARMPL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; ARMPL-NEXT:    ret
+  %result = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> %x)
+  %result.0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 0
+  %result.1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 1
+  store <vscale x 2 x double> %result.0, ptr %out_sin, align 8
+  store <vscale x 2 x double> %result.1, ptr %out_cos, align 8
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/114039