[llvm] [IR] Add llvm.sincospi intrinsic (PR #125873)

Wed Feb 5 07:53:00 PST 2025

https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/125873

This adds the `llvm.sincospi` intrinsic, legalization, and lowering (mostly reusing the lowering for sincos and frexp).

The `llvm.sincospi` intrinsic takes a floating-point value and returns both the sine and cosine of the value multiplied by pi. It computes the result more accurately than the naive approach of doing the multiplication ahead of time, especially for large input values.

```
declare { float, float }          @llvm.sincospi.f32(float  %Val)
declare { double, double }        @llvm.sincospi.f64(double %Val)
declare { x86_fp80, x86_fp80 }    @llvm.sincospi.f80(x86_fp80  %Val)
declare { fp128, fp128 }          @llvm.sincospi.f128(fp128 %Val)
declare { ppc_fp128, ppc_fp128 }  @llvm.sincospi.ppcf128(ppc_fp128  %Val)
declare { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float>  %Val)
```

Currently, the default lowering of this intrinsic relies on the `sincospi[f|l]` functions being available in the target's runtime (e.g. libc).

From a91d1886432e1dc576872a6173eba70102954421 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 7 Jan 2025 13:50:46 +0000
Subject: [PATCH 1/2] [IR] Add `llvm.modf` intrinsic

This adds the `llvm.modf` intrinsic, legalization, and lowering.

The `llvm.modf` intrinsic takes a floating-point value and returns both
the integral and fractional parts (as a struct).

```
declare { float, float }             @llvm.modf.f32(float  %Val)
declare { double, double }           @llvm.modf.f64(double %Val)
declare { x86_fp80, x86_fp80 }       @llvm.modf.f80(x86_fp80  %Val)
declare { fp128, fp128 }             @llvm.modf.f128(fp128 %Val)
declare { ppc_fp128, ppc_fp128 }     @llvm.modf.ppcf128(ppc_fp128  %Val)
declare { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float>  %Val)
```

This corresponds to the libm `modf` function but returns multiple values
in a struct (rather than take output pointers), which makes it easier to
vectorize.
---
 llvm/docs/LangRef.rst                         |  60 +++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   3 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |   4 +
 .../include/llvm/CodeGen/RuntimeLibcallUtil.h |   4 +
 llvm/include/llvm/IR/Intrinsics.td            |   2 +
 llvm/include/llvm/IR/RuntimeLibcalls.def      |   5 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  10 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   3 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   9 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   3 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  11 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  26 +-
 llvm/test/CodeGen/AArch64/llvm.modf.ll        | 255 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll |  57 ++++
 16 files changed, 437 insertions(+), 20 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/llvm.modf.ll
 create mode 100644 llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8891aedcb58e552..587774137b8981e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16122,6 +16122,66 @@ of the argument.
 When specified with the fast-math-flag 'afn', the result may be approximated
 using a less accurate calculation.
 
+'``llvm.modf.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.modf`` on any floating-point
+or vector of floating-point type. However, not all targets support all types.
+
+::
+
+ declare { float, float }             @llvm.modf.f32(float  %Val)
+ declare { double, double }           @llvm.modf.f64(double %Val)
+ declare { x86_fp80, x86_fp80 }       @llvm.modf.f80(x86_fp80  %Val)
+ declare { fp128, fp128 }             @llvm.modf.f128(fp128 %Val)
+ declare { ppc_fp128, ppc_fp128 }     @llvm.modf.ppcf128(ppc_fp128  %Val)
+ declare { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float>  %Val)
+
+Overview:
+"""""""""
+
+The '``llvm.modf.*``' intrinsics return the operand's integral and fractional
+parts.
+
+Arguments:
+""""""""""
+
+The argument is a :ref:`floating-point <t_floating>` value or
+:ref:`vector <t_vector>` of floating-point values. Returns two values matching
+the argument type in a struct.
+
+Semantics:
+""""""""""
+
+Return the same values as a corresponding libm '``modf``' function without
+trapping or setting ``errno``.
+
+The first result is the fractional part of the operand and the second result is
+the integral part of the operand. Both results have the same sign as the operand.
+
+Not including exceptional inputs (listed below), ``llvm.modf.*`` is semantically
+equivalent to::
+
+  %fp = frem <fptype> %x, 1.0  ; Fractional part
+  %ip = fsub <fptype> %x, %fp  ; Integral part
+
+(assuming no floating-point precision errors)
+
+If the argument is a zero, returns a zero with the same sign for both the
+fractional and integral parts.
+
+If the argument is an infinity, returns a fractional part of zero with the same
+sign, and infinity with the same sign as the integral part.
+
+If the argument is a NaN, a NaN is returned as both fractional and integral
+parts.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
+
 '``llvm.pow.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index a76de251c713823..8468992ed4b7a3d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2101,6 +2101,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::sincos:
       ISD = ISD::FSINCOS;
       break;
+    case Intrinsic::modf:
+      ISD = ISD::FMODF;
+      break;
     case Intrinsic::tan:
       ISD = ISD::FTAN;
       break;
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index fd8784a4c10034c..74639a81223b28e 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1058,6 +1058,10 @@ enum NodeType {
   /// FSINCOS - Compute both fsin and fcos as a single operation.
   FSINCOS,
 
+  /// FMODF - Decomposes the given arg into integral and fractional parts, each
+  /// having the same type and sign as the arg.
+  FMODF,
+
   /// Gets the current floating-point environment. The first operand is a token
   /// chain. The results are FP environment, represented by an integer value,
   /// and a token chain.
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 045ec7d3653119d..59313520e0d831c 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -66,6 +66,10 @@ Libcall getFREXP(EVT RetVT);
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getFSINCOS(EVT RetVT);
 
+/// getMODF - Return the MODF_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getMODF(EVT RetVT);
+
 /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getSYNC(unsigned Opc, MVT VT);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index ee877349a33149e..2c22060237faa61 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1063,6 +1063,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
   def int_roundeven    : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>],
                              [llvm_anyfloat_ty]>;
+  def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>],
+                             [llvm_anyfloat_ty]>;
 
   // Truncate a floating point number with a specific rounding mode
   def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 8153845b52c7ae0..dc69b1ae19769ea 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -354,6 +354,11 @@ HANDLE_LIBCALL(FREXP_F64, "frexp")
 HANDLE_LIBCALL(FREXP_F80, "frexpl")
 HANDLE_LIBCALL(FREXP_F128, "frexpl")
 HANDLE_LIBCALL(FREXP_PPCF128, "frexpl")
+HANDLE_LIBCALL(MODF_F32, "modff")
+HANDLE_LIBCALL(MODF_F64, "modf")
+HANDLE_LIBCALL(MODF_F80, "modfl")
+HANDLE_LIBCALL(MODF_F128, "modfl")
+HANDLE_LIBCALL(MODF_PPCF128, "modfl")
 
 // Floating point environment
 HANDLE_LIBCALL(FEGETENV, "fegetenv")
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 6c9c96ceaa4ba81..f61928a66eb3cff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4609,12 +4609,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     ExpandFPLibCall(Node, RTLIB::LDEXP_F32, RTLIB::LDEXP_F64, RTLIB::LDEXP_F80,
                     RTLIB::LDEXP_F128, RTLIB::LDEXP_PPCF128, Results);
     break;
+  case ISD::FMODF:
   case ISD::FFREXP: {
-    RTLIB::Libcall LC = RTLIB::getFREXP(Node->getValueType(0));
+    EVT VT = Node->getValueType(0);
+    RTLIB::Libcall LC = Node->getOpcode() == ISD::FMODF ? RTLIB::getMODF(VT)
+                                                        : RTLIB::getFREXP(VT);
     bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results,
                                                       /*CallRetResNo=*/0);
     if (!Expanded)
-      llvm_unreachable("Expected scalar FFREXP to expand to libcall!");
+      llvm_unreachable("Expected scalar FFREXP/FMODF to expand to libcall!");
     break;
   }
   case ISD::FPOWI:
@@ -5503,9 +5506,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Results.push_back(Tmp2.getValue(1));
     break;
   }
+  case ISD::FMODF:
   case ISD::FSINCOS: {
     Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
-    Tmp2 = DAG.getNode(ISD::FSINCOS, dl, DAG.getVTList(NVT, NVT), Tmp1,
+    Tmp2 = DAG.getNode(Node->getOpcode(), dl, DAG.getVTList(NVT, NVT), Tmp1,
                        Node->getFlags());
     Tmp3 = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
     for (unsigned ResNum = 0; ResNum < Node->getNumValues(); ResNum++)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 71f100bfa034343..2a4eed1ed527a81 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2766,10 +2766,10 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FLDEXP:     R = PromoteFloatRes_ExpOp(N); break;
     case ISD::FFREXP:     R = PromoteFloatRes_FFREXP(N); break;
 
+    case ISD::FMODF:
     case ISD::FSINCOS:
       R = PromoteFloatRes_UnaryWithTwoFPResults(N);
       break;
-
     case ISD::FP_ROUND:   R = PromoteFloatRes_FP_ROUND(N); break;
     case ISD::STRICT_FP_ROUND:
       R = PromoteFloatRes_STRICT_FP_ROUND(N);
@@ -3228,6 +3228,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
 
   case ISD::FFREXP:      R = SoftPromoteHalfRes_FFREXP(N); break;
 
+  case ISD::FMODF:
   case ISD::FSINCOS:
     R = SoftPromoteHalfRes_UnaryWithTwoFPResults(N);
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6ad08bce44b0a45..416da1bb7bfcf01 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -454,6 +454,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::UMULO:
   case ISD::FCANONICALIZE:
   case ISD::FFREXP:
+  case ISD::FMODF:
   case ISD::FSINCOS:
   case ISD::SADDSAT:
   case ISD::UADDSAT:
@@ -1223,6 +1224,14 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
     break;
   }
+  case ISD::FMODF: {
+    RTLIB::Libcall LC =
+        RTLIB::getMODF(Node->getValueType(0).getVectorElementType());
+    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results,
+                                          /*CallRetResNo=*/0))
+      return;
+    break;
+  }
   case ISD::VECTOR_COMPRESS:
     Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
     return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1000235ab4061f7..adafbe7cdcaa656 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -133,6 +133,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADDRSPACECAST:
     R = ScalarizeVecRes_ADDRSPACECAST(N);
     break;
+  case ISD::FMODF:
   case ISD::FFREXP:
   case ISD::FSINCOS:
     R = ScalarizeVecRes_UnaryOpWithTwoResults(N, ResNo);
@@ -1261,6 +1262,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADDRSPACECAST:
     SplitVecRes_ADDRSPACECAST(N, Lo, Hi);
     break;
+  case ISD::FMODF:
   case ISD::FFREXP:
   case ISD::FSINCOS:
     SplitVecRes_UnaryOpWithTwoResults(N, ResNo, Lo, Hi);
@@ -4783,6 +4785,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_FSHR:
     Res = WidenVecRes_Ternary(N);
     break;
+  case ISD::FMODF:
   case ISD::FFREXP:
   case ISD::FSINCOS: {
     if (!unrollExpandedOp())
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 428e7a316d247b0..6833f6c183d645b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6977,6 +6977,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1)), Flags));
     return;
+  case Intrinsic::modf:
   case Intrinsic::sincos:
   case Intrinsic::frexp: {
     unsigned Opcode;
@@ -6986,6 +6987,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     case Intrinsic::sincos:
       Opcode = ISD::FSINCOS;
       break;
+    case Intrinsic::modf:
+      Opcode = ISD::FMODF;
+      break;
     case Intrinsic::frexp:
       Opcode = ISD::FFREXP;
       break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index f63c8dd3df1c838..7b1a2d640a2bd4e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -219,6 +219,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FCOS:                       return "fcos";
   case ISD::STRICT_FCOS:                return "strict_fcos";
   case ISD::FSINCOS:                    return "fsincos";
+  case ISD::FMODF:                      return "fmodf";
   case ISD::FTAN:                       return "ftan";
   case ISD::STRICT_FTAN:                return "strict_ftan";
   case ISD::FASIN:                      return "fasin";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9c56912aa6ba031..1f39ec205c51794 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -407,6 +407,11 @@ RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
                       SINCOS_PPCF128);
 }
 
+RTLIB::Libcall RTLIB::getMODF(EVT RetVT) {
+  return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128,
+                      MODF_PPCF128);
+}
+
 RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
                                              AtomicOrdering Order,
                                              uint64_t MemSize) {
@@ -775,9 +780,9 @@ void TargetLoweringBase::initActions() {
     setOperationAction({ISD::BITREVERSE, ISD::PARITY}, VT, Expand);
 
     // These library functions default to expand.
-    setOperationAction(
-        {ISD::FROUND, ISD::FPOWI, ISD::FLDEXP, ISD::FFREXP, ISD::FSINCOS}, VT,
-        Expand);
+    setOperationAction({ISD::FROUND, ISD::FPOWI, ISD::FLDEXP, ISD::FFREXP,
+                        ISD::FSINCOS, ISD::FMODF},
+                       VT, Expand);
 
     // These operations default to expand for vector types.
     if (VT.isVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8617377ffc55b58..558a8b03bda9711 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -735,19 +735,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
   }
 
-  for (auto Op : {ISD::FREM,          ISD::FPOW,          ISD::FPOWI,
-                  ISD::FCOS,          ISD::FSIN,          ISD::FSINCOS,
-                  ISD::FACOS,         ISD::FASIN,         ISD::FATAN,
-                  ISD::FATAN2,        ISD::FCOSH,         ISD::FSINH,
-                  ISD::FTANH,         ISD::FTAN,          ISD::FEXP,
-                  ISD::FEXP2,         ISD::FEXP10,        ISD::FLOG,
-                  ISD::FLOG2,         ISD::FLOG10,        ISD::STRICT_FREM,
-                  ISD::STRICT_FPOW,   ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,
-                  ISD::STRICT_FSIN,   ISD::STRICT_FACOS,  ISD::STRICT_FASIN,
-                  ISD::STRICT_FATAN,  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH,
-                  ISD::STRICT_FSINH,  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,
-                  ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2,
-                  ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
+  for (auto Op : {ISD::FREM,         ISD::FPOW,          ISD::FPOWI,
+                  ISD::FCOS,         ISD::FSIN,          ISD::FSINCOS,
+                  ISD::FMODF,        ISD::FACOS,         ISD::FASIN,
+                  ISD::FATAN,        ISD::FATAN2,        ISD::FCOSH,
+                  ISD::FSINH,        ISD::FTANH,         ISD::FTAN,
+                  ISD::FEXP,         ISD::FEXP2,         ISD::FEXP10,
+                  ISD::FLOG,         ISD::FLOG2,         ISD::FLOG10,
+                  ISD::STRICT_FREM,  ISD::STRICT_FPOW,   ISD::STRICT_FPOWI,
+                  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,   ISD::STRICT_FACOS,
+                  ISD::STRICT_FASIN, ISD::STRICT_FATAN,  ISD::STRICT_FATAN2,
+                  ISD::STRICT_FCOSH, ISD::STRICT_FSINH,  ISD::STRICT_FTANH,
+                  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,
+                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
     setOperationAction(Op, MVT::f16, Promote);
     setOperationAction(Op, MVT::v4f16, Expand);
     setOperationAction(Op, MVT::v8f16, Expand);
diff --git a/llvm/test/CodeGen/AArch64/llvm.modf.ll b/llvm/test/CodeGen/AArch64/llvm.modf.ll
new file mode 100644
index 000000000000000..41fe796daca86c0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/llvm.modf.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s
+
+define { half, half } @test_modf_f16(half %a) {
+; CHECK-LABEL: test_modf_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr s1, [sp, #12]
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    fcvt h1, s1
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.modf.f16(half %a)
+  ret { half, half } %result
+}
+
+define half @test_modf_f16_only_use_fractional_part(half %a) {
+; CHECK-LABEL: test_modf_f16_only_use_fractional_part:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.modf.f16(half %a)
+  %result.0 = extractvalue { half, half } %result, 0
+  ret half %result.0
+}
+
+define half @test_modf_f16_only_use_integral_part(half %a) {
+; CHECK-LABEL: test_modf_f16_only_use_integral_part:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.modf.f16(half %a)
+  %result.1 = extractvalue { half, half } %result, 1
+  ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_modf_v2f16(<2 x half> %a) {
+; CHECK-LABEL: test_modf_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    add x0, sp, #40
+; CHECK-NEXT:    fcvt s1, h1
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fcvt h2, s0
+; CHECK-NEXT:    add x0, sp, #56
+; CHECK-NEXT:    mov h1, v1.h[2]
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-NEXT:    str q2, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fcvt h2, s0
+; CHECK-NEXT:    add x0, sp, #60
+; CHECK-NEXT:    mov h1, v1.h[3]
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldp s2, s1, [sp, #40]
+; CHECK-NEXT:    fcvt h4, s0
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    fcvt h3, s1
+; CHECK-NEXT:    fcvt h1, s2
+; CHECK-NEXT:    ldr s2, [sp, #56]
+; CHECK-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-NEXT:    fcvt h2, s2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-NEXT:    ldr s3, [sp, #60]
+; CHECK-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-NEXT:    fcvt h2, s3
+; CHECK-NEXT:    mov v1.h[3], v2.h[0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %result = call { <2 x half>, <2 x half> } @llvm.modf.v2f16(<2 x half> %a)
+  ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_modf_f32(float %a) {
+; CHECK-LABEL: test_modf_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr s1, [sp, #12]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { float, float } @llvm.modf.f32(float %a)
+  ret { float, float } %result
+}
+
+define { <3 x float>, <3 x float> } @test_modf_v3f32(<3 x float> %a) {
+; CHECK-LABEL: test_modf_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    add x0, sp, #56
+; CHECK-NEXT:    add x19, sp, #56
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    add x0, sp, #60
+; CHECK-NEXT:    add x20, sp, #60
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr s1, [sp, #44]
+; CHECK-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    ld1 { v1.s }[1], [x19]
+; CHECK-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-NEXT:    ld1 { v1.s }[2], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+  %result = call { <3 x float>, <3 x float> } @llvm.modf.v3f32(<3 x float> %a)
+  ret { <3 x float>, <3 x float> } %result
+}
+
+define { <2 x float>, <2 x float> } @test_modf_v2f32(<2 x float> %a) {
+; CHECK-LABEL: test_modf_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #40
+; CHECK-NEXT:    add x19, sp, #40
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl modff
+; CHECK-NEXT:    ldr s1, [sp, #44]
+; CHECK-NEXT:    ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ld1 { v1.s }[1], [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %result = call { <2 x float>, <2 x float> } @llvm.modf.v2f32(<2 x float> %a)
+  ret { <2 x float>, <2 x float> } %result
+}
+
+define { double, double } @test_modf_f64(double %a) {
+; CHECK-LABEL: test_modf_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #8
+; CHECK-NEXT:    bl modf
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { double, double } @llvm.modf.f64(double %a)
+  ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_modf_v2f64(<2 x double> %a) {
+; CHECK-LABEL: test_modf_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov d0, v0.d[1]
+; CHECK-NEXT:    add x0, sp, #32
+; CHECK-NEXT:    add x19, sp, #32
+; CHECK-NEXT:    bl modf
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #40
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    bl modf
+; CHECK-NEXT:    ldr d1, [sp, #40]
+; CHECK-NEXT:    ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    ld1 { v1.d }[1], [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %a)
+  ret { <2 x double>, <2 x double> } %result
+}
diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll
new file mode 100644
index 000000000000000..1874d265978d702
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "(bl|ptrue)" --version 5
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
+
+define <4 x float> @test_modf_v4f32(<4 x float> %x, ptr %out_integral) {
+; SLEEF-LABEL: test_modf_v4f32:
+; SLEEF:    bl _ZGVnN4vl4_modff
+;
+; ARMPL-LABEL: test_modf_v4f32:
+; ARMPL:    bl armpl_vmodfq_f32
+  %result = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> %x)
+  %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
+  %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
+  store <4 x float> %result.1, ptr %out_integral, align 4
+  ret <4 x float> %result.0
+}
+
+define <2 x double> @test_modf_v2f64(<2 x double> %x, ptr %out_integral) {
+; SLEEF-LABEL: test_modf_v2f64:
+; SLEEF:    bl _ZGVnN2vl8_modf
+;
+; ARMPL-LABEL: test_modf_v2f64:
+; ARMPL:    bl armpl_vmodfq_f64
+  %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %x)
+  %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
+  %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
+  store <2 x double> %result.1, ptr %out_integral, align 8
+  ret <2 x double> %result.0
+}
+
+define <vscale x 4 x float> @test_modf_nxv4f32(<vscale x 4 x float> %x, ptr %out_integral) {
+; SLEEF-LABEL: test_modf_nxv4f32:
+; SLEEF:    bl _ZGVsNxvl4_modff
+;
+; ARMPL-LABEL: test_modf_nxv4f32:
+; ARMPL:    ptrue p0.s
+; ARMPL:    bl armpl_svmodf_f32_x
+  %result = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> %x)
+  %result.0 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 0
+  %result.1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %result, 1
+  store <vscale x 4 x float> %result.1, ptr %out_integral, align 4
+  ret <vscale x 4 x float> %result.0
+}
+
+define <vscale x 2 x double> @test_modf_nxv2f64(<vscale x 2 x double> %x, ptr %out_integral) {
+; SLEEF-LABEL: test_modf_nxv2f64:
+; SLEEF:    bl _ZGVsNxvl8_modf
+;
+; ARMPL-LABEL: test_modf_nxv2f64:
+; ARMPL:    ptrue p0.d
+; ARMPL:    bl armpl_svmodf_f64_x
+  %result = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.modf.nxv2f64(<vscale x 2 x double> %x)
+  %result.0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 0
+  %result.1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %result, 1
+  store <vscale x 2 x double> %result.1, ptr %out_integral, align 8
+  ret <vscale x 2 x double> %result.0
+}

>From cb929bce3aa4451004ff14d0202f64c46bf6f7c9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 5 Feb 2025 15:42:17 +0000
Subject: [PATCH 2/2] [IR] Add `llvm.sincospi` intrinsic

This adds the `llvm.sincospi` intrinsic, legalization, and lowering
(mostly reusing the lowering for sincos and frexp).

The `llvm.sincospi` intrinsic takes a floating-point value and returns
both the sine and cosine of the value multiplied by pi. It computes the
result more accurately than the naive approach of doing the
multiplication ahead of time, especially for large input values.

```
declare { float, float }          @llvm.sincospi.f32(float  %Val)
declare { double, double }        @llvm.sincospi.f64(double %Val)
declare { x86_fp80, x86_fp80 }    @llvm.sincospi.f80(x86_fp80  %Val)
declare { fp128, fp128 }          @llvm.sincospi.f128(fp128 %Val)
declare { ppc_fp128, ppc_fp128 }  @llvm.sincospi.ppcf128(ppc_fp128  %Val)
declare { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float>  %Val)
```

Currently, the default lowering of this intrinsic relies on the
`sincospi[f|l]` functions being available in the target's runtime
(e.g. libc).
---
 llvm/docs/LangRef.rst                         |  46 +++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   3 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |   4 +
 .../include/llvm/CodeGen/RuntimeLibcallUtil.h |   4 +
 llvm/include/llvm/IR/Intrinsics.td            |   2 +
 llvm/include/llvm/IR/RuntimeLibcalls.def      |   5 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  13 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   6 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |  10 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   5 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |   7 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  31 +-
 llvm/test/CodeGen/AArch64/llvm.sincospi.ll    | 268 ++++++++++++++++++
 .../CodeGen/AArch64/veclib-llvm.sincospi.ll   |  61 ++++
 16 files changed, 444 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/llvm.sincospi.ll
 create mode 100644 llvm/test/CodeGen/AArch64/veclib-llvm.sincospi.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 587774137b8981e..2f1a70e7d52612c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16122,6 +16122,52 @@ of the argument.
 When specified with the fast-math-flag 'afn', the result may be approximated
 using a less accurate calculation.
 
+'``llvm.sincospi.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.sincospi`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare { float, float }          @llvm.sincospi.f32(float  %Val)
+      declare { double, double }        @llvm.sincospi.f64(double %Val)
+      declare { x86_fp80, x86_fp80 }    @llvm.sincospi.f80(x86_fp80  %Val)
+      declare { fp128, fp128 }          @llvm.sincospi.f128(fp128 %Val)
+      declare { ppc_fp128, ppc_fp128 }  @llvm.sincospi.ppcf128(ppc_fp128  %Val)
+      declare { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float>  %Val)
+
+Overview:
+"""""""""
+
+The '``llvm.sincospi.*``' intrinsics return the sine and cosine of pi*operand.
+
+Arguments:
+""""""""""
+
+The argument is a :ref:`floating-point <t_floating>` value or
+:ref:`vector <t_vector>` of floating-point values. Returns two values matching
+the argument type in a struct.
+
+Semantics:
+""""""""""
+
+This is equivalent to the ``llvm.sincos.*`` intrinsic where the argument has
+been multiplied by pi; however, it computes the result more accurately,
+especially for large input values.
+
+.. note::
+
+  Currently, the default lowering of this intrinsic relies on the
+  ``sincospi[f|l]`` functions being available in the target's runtime (e.g. libc).
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
+
 '``llvm.modf.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 8468992ed4b7a3d..339b83637fa8f1c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2101,6 +2101,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::sincos:
       ISD = ISD::FSINCOS;
       break;
+    case Intrinsic::sincospi:
+      ISD = ISD::FSINCOSPI;
+      break;
     case Intrinsic::modf:
       ISD = ISD::FMODF;
       break;
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 74639a81223b28e..28f407df0597368 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1058,6 +1058,10 @@ enum NodeType {
   /// FSINCOS - Compute both fsin and fcos as a single operation.
   FSINCOS,
 
+  /// FSINCOSPI - Compute both the sine and cosine of pi*x, more accurately
+  /// than FSINCOS(pi*x), especially for large x.
+  FSINCOSPI,
+
   /// FMODF - Decomposes the given arg in integral and fractional parts, each
   /// having the same type and sign as the arg.
   FMODF,
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 59313520e0d831c..34d783ae3f513e9 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -66,6 +66,10 @@ Libcall getFREXP(EVT RetVT);
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getFSINCOS(EVT RetVT);
 
+/// getSINCOSPI - Return the SINCOSPI_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getSINCOSPI(EVT RetVT);
+
 /// getMODF - Return the MODF_* value for the given types, or
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getMODF(EVT RetVT);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 2c22060237faa61..6fd5c1a820a9464 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1063,6 +1063,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
   def int_roundeven    : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>],
                              [llvm_anyfloat_ty]>;
+  def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>],
+                             [llvm_anyfloat_ty]>;
   def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>],
                              [llvm_anyfloat_ty]>;
 
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index dc69b1ae19769ea..a7963543c4350de 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -354,6 +354,11 @@ HANDLE_LIBCALL(FREXP_F64, "frexp")
 HANDLE_LIBCALL(FREXP_F80, "frexpl")
 HANDLE_LIBCALL(FREXP_F128, "frexpl")
 HANDLE_LIBCALL(FREXP_PPCF128, "frexpl")
+HANDLE_LIBCALL(SINCOSPI_F32, "sincospif")
+HANDLE_LIBCALL(SINCOSPI_F64, "sincospi")
+HANDLE_LIBCALL(SINCOSPI_F80, "sincospil")
+HANDLE_LIBCALL(SINCOSPI_F128, "sincospil")
+HANDLE_LIBCALL(SINCOSPI_PPCF128, "sincospil")
 HANDLE_LIBCALL(MODF_F32, "modff")
 HANDLE_LIBCALL(MODF_F64, "modf")
 HANDLE_LIBCALL(MODF_F80, "modfl")
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f61928a66eb3cff..66d7f57b93fb773 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4520,11 +4520,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     ExpandFPLibCall(Node, RTLIB::TANH_F32, RTLIB::TANH_F64, RTLIB::TANH_F80,
                     RTLIB::TANH_F128, RTLIB::TANH_PPCF128, Results);
     break;
-  case ISD::FSINCOS: {
-    RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getValueType(0));
+  case ISD::FSINCOS:
+  case ISD::FSINCOSPI: {
+    EVT VT = Node->getValueType(0);
+    RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
+                            ? RTLIB::getFSINCOS(VT)
+                            : RTLIB::getSINCOSPI(VT);
     bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results);
     if (!Expanded)
-      llvm_unreachable("Expected scalar FSINCOS to expand to libcall!");
+      llvm_unreachable("Expected scalar FSINCOS[PI] to expand to libcall!");
     break;
   }
   case ISD::FLOG:
@@ -5507,7 +5511,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     break;
   }
   case ISD::FMODF:
-  case ISD::FSINCOS: {
+  case ISD::FSINCOS:
+  case ISD::FSINCOSPI: {
     Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
     Tmp2 = DAG.getNode(Node->getOpcode(), dl, DAG.getVTList(NVT, NVT), Tmp1,
                        Node->getFlags());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2a4eed1ed527a81..4abd07546a84df2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2768,8 +2768,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
 
     case ISD::FMODF:
     case ISD::FSINCOS:
-      R = PromoteFloatRes_UnaryWithTwoFPResults(N);
-      break;
+    case ISD::FSINCOSPI:
+                          R = PromoteFloatRes_UnaryWithTwoFPResults(N);
+                          break;
     case ISD::FP_ROUND:   R = PromoteFloatRes_FP_ROUND(N); break;
     case ISD::STRICT_FP_ROUND:
       R = PromoteFloatRes_STRICT_FP_ROUND(N);
@@ -3230,6 +3231,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
 
   case ISD::FMODF:
   case ISD::FSINCOS:
+  case ISD::FSINCOSPI:
     R = SoftPromoteHalfRes_UnaryWithTwoFPResults(N);
     break;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 416da1bb7bfcf01..111b08aeab185a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -456,6 +456,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FFREXP:
   case ISD::FMODF:
   case ISD::FSINCOS:
+  case ISD::FSINCOSPI:
   case ISD::SADDSAT:
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
@@ -1217,9 +1218,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
 
     break;
-  case ISD::FSINCOS: {
-    RTLIB::Libcall LC =
-        RTLIB::getFSINCOS(Node->getValueType(0).getVectorElementType());
+  case ISD::FSINCOS:
+  case ISD::FSINCOSPI: {
+    EVT VT = Node->getValueType(0).getVectorElementType();
+    RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
+                            ? RTLIB::getFSINCOS(VT)
+                            : RTLIB::getSINCOSPI(VT);
     if (DAG.expandMultipleResultFPLibCall(LC, Node, Results))
       return;
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index adafbe7cdcaa656..96d364d32bb6640 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -136,6 +136,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMODF:
   case ISD::FFREXP:
   case ISD::FSINCOS:
+  case ISD::FSINCOSPI:
     R = ScalarizeVecRes_UnaryOpWithTwoResults(N, ResNo);
     break;
   case ISD::ADD:
@@ -1265,6 +1266,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMODF:
   case ISD::FFREXP:
   case ISD::FSINCOS:
+  case ISD::FSINCOSPI:
     SplitVecRes_UnaryOpWithTwoResults(N, ResNo, Lo, Hi);
     break;
 
@@ -4787,7 +4789,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     break;
   case ISD::FMODF:
   case ISD::FFREXP:
-  case ISD::FSINCOS: {
+  case ISD::FSINCOS:
+  case ISD::FSINCOSPI: {
     if (!unrollExpandedOp())
       Res = WidenVecRes_UnaryOpWithTwoResults(N, ResNo);
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 6833f6c183d645b..ef49c6a9eda46ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6979,6 +6979,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   case Intrinsic::modf:
   case Intrinsic::sincos:
+  case Intrinsic::sincospi:
   case Intrinsic::frexp: {
     unsigned Opcode;
     switch (Intrinsic) {
@@ -6987,6 +6988,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     case Intrinsic::sincos:
       Opcode = ISD::FSINCOS;
       break;
+    case Intrinsic::sincospi:
+      Opcode = ISD::FSINCOSPI;
+      break;
     case Intrinsic::modf:
       Opcode = ISD::FMODF;
       break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 7b1a2d640a2bd4e..5d3e404350cda32 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -219,6 +219,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FCOS:                       return "fcos";
   case ISD::STRICT_FCOS:                return "strict_fcos";
   case ISD::FSINCOS:                    return "fsincos";
+  case ISD::FSINCOSPI:                  return "fsincospi";
   case ISD::FMODF:                      return "fmodf";
   case ISD::FTAN:                       return "ftan";
   case ISD::STRICT_FTAN:                return "strict_ftan";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 1f39ec205c51794..d9a19dfceb6d315 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -407,6 +407,11 @@ RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
                       SINCOS_PPCF128);
 }
 
+RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) {
+  return getFPLibCall(RetVT, SINCOSPI_F32, SINCOSPI_F64, SINCOSPI_F80,
+                      SINCOSPI_F128, SINCOSPI_PPCF128);
+}
+
 RTLIB::Libcall RTLIB::getMODF(EVT RetVT) {
   return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128,
                       MODF_PPCF128);
@@ -781,7 +786,7 @@ void TargetLoweringBase::initActions() {
 
     // These library functions default to expand.
     setOperationAction({ISD::FROUND, ISD::FPOWI, ISD::FLDEXP, ISD::FFREXP,
-                        ISD::FSINCOS, ISD::FMODF},
+                        ISD::FSINCOS, ISD::FSINCOSPI, ISD::FMODF},
                        VT, Expand);
 
     // These operations default to expand for vector types.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 558a8b03bda9711..668bfd6a76aae06 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -735,19 +735,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
   }
 
-  for (auto Op : {ISD::FREM,         ISD::FPOW,          ISD::FPOWI,
-                  ISD::FCOS,         ISD::FSIN,          ISD::FSINCOS,
-                  ISD::FMODF,        ISD::FACOS,         ISD::FASIN,
-                  ISD::FATAN,        ISD::FATAN2,        ISD::FCOSH,
-                  ISD::FSINH,        ISD::FTANH,         ISD::FTAN,
-                  ISD::FEXP,         ISD::FEXP2,         ISD::FEXP10,
-                  ISD::FLOG,         ISD::FLOG2,         ISD::FLOG10,
-                  ISD::STRICT_FREM,  ISD::STRICT_FPOW,   ISD::STRICT_FPOWI,
-                  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,   ISD::STRICT_FACOS,
-                  ISD::STRICT_FASIN, ISD::STRICT_FATAN,  ISD::STRICT_FATAN2,
-                  ISD::STRICT_FCOSH, ISD::STRICT_FSINH,  ISD::STRICT_FTANH,
-                  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,
-                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
+  for (auto Op : {ISD::FREM,          ISD::FPOW,         ISD::FPOWI,
+                  ISD::FCOS,          ISD::FSIN,         ISD::FSINCOS,
+                  ISD::FSINCOSPI,     ISD::FMODF,        ISD::FACOS,
+                  ISD::FASIN,         ISD::FATAN,        ISD::FATAN2,
+                  ISD::FCOSH,         ISD::FSINH,        ISD::FTANH,
+                  ISD::FTAN,          ISD::FEXP,         ISD::FEXP2,
+                  ISD::FEXP10,        ISD::FLOG,         ISD::FLOG2,
+                  ISD::FLOG10,        ISD::STRICT_FREM,  ISD::STRICT_FPOW,
+                  ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
+                  ISD::STRICT_FACOS,  ISD::STRICT_FASIN, ISD::STRICT_FATAN,
+                  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH,
+                  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
+                  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2, ISD::STRICT_FLOG10,
+                  ISD::STRICT_FTAN}) {
     setOperationAction(Op, MVT::f16, Promote);
     setOperationAction(Op, MVT::v4f16, Expand);
     setOperationAction(Op, MVT::v8f16, Expand);
@@ -1208,7 +1209,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           ISD::FSIN,              ISD::FCOS,           ISD::FTAN,
           ISD::FASIN,             ISD::FACOS,          ISD::FATAN,
           ISD::FSINH,             ISD::FCOSH,          ISD::FTANH,
-          ISD::FPOW,              ISD::FLOG,           ISD::FLOG2,          
+          ISD::FPOW,              ISD::FLOG,           ISD::FLOG2,
           ISD::FLOG10,            ISD::FEXP,           ISD::FEXP2,
           ISD::FEXP10,            ISD::FRINT,          ISD::FROUND,
           ISD::FROUNDEVEN,        ISD::FTRUNC,         ISD::FMINNUM,
@@ -1217,7 +1218,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           ISD::STRICT_FADD,       ISD::STRICT_FSUB,    ISD::STRICT_FMUL,
           ISD::STRICT_FDIV,       ISD::STRICT_FMA,     ISD::STRICT_FCEIL,
           ISD::STRICT_FFLOOR,     ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,
-          ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  
+          ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,
           ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
           ISD::STRICT_FMINIMUM,   ISD::STRICT_FMAXIMUM})
       setOperationAction(Op, MVT::v1f64, Expand);
diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll
new file mode 100644
index 000000000000000..d1d7d92adc05ae3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s
+
+define { half, half } @test_sincospi_f16(half %a) {
+; CHECK-LABEL: test_sincospi_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    fcvt h1, s1
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+define half @test_sincospi_f16_only_use_sin(half %a) {
+; CHECK-LABEL: test_sincospi_f16_only_use_sin:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.0 = extractvalue { half, half } %result, 0
+  ret half %result.0
+}
+
+define half @test_sincospi_f16_only_use_cos(half %a) {
+; CHECK-LABEL: test_sincospi_f16_only_use_cos:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr s0, [sp, #8]
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.1 = extractvalue { half, half } %result, 1
+  ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) {
+; CHECK-LABEL: test_sincospi_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #36
+; CHECK-NEXT:    add x1, sp, #32
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #28
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #60
+; CHECK-NEXT:    add x1, sp, #56
+; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldp s2, s0, [sp, #32]
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp s3, s1, [sp, #24]
+; CHECK-NEXT:    fcvt h4, s0
+; CHECK-NEXT:    fcvt h2, s2
+; CHECK-NEXT:    fcvt h0, s1
+; CHECK-NEXT:    fcvt h1, s3
+; CHECK-NEXT:    ldp s5, s3, [sp, #40]
+; CHECK-NEXT:    fcvt h3, s3
+; CHECK-NEXT:    mov v0.h[1], v4.h[0]
+; CHECK-NEXT:    fcvt h4, s5
+; CHECK-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-NEXT:    ldp s5, s2, [sp, #56]
+; CHECK-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-NEXT:    fcvt h2, s2
+; CHECK-NEXT:    fcvt h3, s5
+; CHECK-NEXT:    mov v1.h[2], v4.h[0]
+; CHECK-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a)
+  ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincospi_f32(float %a) {
+; CHECK-LABEL: test_sincospi_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) {
+; CHECK-LABEL: test_sincospi_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    add x0, sp, #20
+; CHECK-NEXT:    add x1, sp, #16
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #28
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    add x19, sp, #28
+; CHECK-NEXT:    add x20, sp, #24
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    add x21, sp, #44
+; CHECK-NEXT:    add x22, sp, #40
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #16]
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ld1 { v0.s }[1], [x19]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ld1 { v0.s }[2], [x21]
+; CHECK-NEXT:    ld1 { v1.s }[2], [x22]
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+  %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a)
+  ret { <3 x float>, <3 x float> } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) {
+; CHECK-LABEL: test_sincospi_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #28
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    add x19, sp, #28
+; CHECK-NEXT:    add x20, sp, #24
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #40]
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ld1 { v0.s }[1], [x19]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a)
+  ret { <2 x float>, <2 x float> } %result
+}
+
+define { double, double } @test_sincospi_f64(double %a) {
+; CHECK-LABEL: test_sincospi_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincospi
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) {
+; CHECK-LABEL: test_sincospi_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    add x0, sp, #56
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    bl sincospi
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #32
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    add x19, sp, #32
+; CHECK-NEXT:    add x20, sp, #24
+; CHECK-NEXT:    mov d0, v0.d[1]
+; CHECK-NEXT:    bl sincospi
+; CHECK-NEXT:    ldr d0, [sp, #56]
+; CHECK-NEXT:    ldr d1, [sp, #40]
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    ld1 { v0.d }[1], [x19]
+; CHECK-NEXT:    ld1 { v1.d }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+  %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a)
+  ret { <2 x double>, <2 x double> } %result
+}
diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.sincospi.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.sincospi.ll
new file mode 100644
index 000000000000000..6cd05ad8824dbb1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/veclib-llvm.sincospi.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "(bl|ptrue)" --version 5
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
+; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
+
+define void @test_sincospi_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincospi_v4f32:
+; SLEEF:    bl _ZGVnN4vl4l4_sincospif
+;
+; ARMPL-LABEL: test_sincospi_v4f32:
+; ARMPL:    bl armpl_vsincospiq_f32
+  %sincospi = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> %x)
+  %sin = extractvalue { <4 x float>, <4 x float> } %sincospi, 0
+  %cos = extractvalue { <4 x float>, <4 x float> } %sincospi, 1
+  store <4 x float> %sin, ptr %out_sin, align 4
+  store <4 x float> %cos, ptr %out_cos, align 4
+  ret void
+}
+
+define void @test_sincospi_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincospi_v2f64:
+; SLEEF:    bl _ZGVnN2vl8l8_sincospi
+;
+; ARMPL-LABEL: test_sincospi_v2f64:
+; ARMPL:    bl armpl_vsincospiq_f64
+  %sincospi = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %x)
+  %sin = extractvalue { <2 x double>, <2 x double> } %sincospi, 0
+  %cos = extractvalue { <2 x double>, <2 x double> } %sincospi, 1
+  store <2 x double> %sin, ptr %out_sin, align 8
+  store <2 x double> %cos, ptr %out_cos, align 8
+  ret void
+}
+
+define void @test_sincospi_nxv4f32(<vscale x 4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincospi_nxv4f32:
+; SLEEF:    bl _ZGVsNxvl4l4_sincospif
+;
+; ARMPL-LABEL: test_sincospi_nxv4f32:
+; ARMPL:    ptrue p0.s
+; ARMPL:    bl armpl_svsincospi_f32_x
+  %sincospi = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> %x)
+  %sin = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %sincospi, 0
+  %cos = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %sincospi, 1
+  store <vscale x 4 x float> %sin, ptr %out_sin, align 4
+  store <vscale x 4 x float> %cos, ptr %out_cos, align 4
+  ret void
+}
+
+define void @test_sincospi_nxv2f64(<vscale x 2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; SLEEF-LABEL: test_sincospi_nxv2f64:
+; SLEEF:    bl _ZGVsNxvl8l8_sincospi
+;
+; ARMPL-LABEL: test_sincospi_nxv2f64:
+; ARMPL:    ptrue p0.d
+; ARMPL:    bl armpl_svsincospi_f64_x
+  %sincospi = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincospi.nxv2f64(<vscale x 2 x double> %x)
+  %sin = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %sincospi, 0
+  %cos = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %sincospi, 1
+  store <vscale x 2 x double> %sin, ptr %out_sin, align 8
+  store <vscale x 2 x double> %cos, ptr %out_cos, align 8
+  ret void
+}