[llvm] 2f0308e - [arm64] Add tan intrinsic lowering (#94545)

Fri Jun 7 06:42:10 PDT 2024

Author: Farzon Lotfi
Date: 2024-06-07T09:42:06-04:00
New Revision: 2f0308ed02ea622b8228d271d9aebe4bd4deacdd

URL: https://github.com/llvm/llvm-project/commit/2f0308ed02ea622b8228d271d9aebe4bd4deacdd
DIFF: https://github.com/llvm/llvm-project/commit/2f0308ed02ea622b8228d271d9aebe4bd4deacdd.diff

LOG: [arm64] Add tan intrinsic lowering (#94545)

This change is an implementation of
https://github.com/llvm/llvm-project/issues/87367's investigation on
supporting IEEE math operations as intrinsics.
Which was discussed in this RFC:
https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294

This PR is just for Tan.

Now that x86 tan backend landed:
https://github.com/llvm/llvm-project/pull/90503 we can add other
backends since the shared pieces are in tree now.

Changes:
- `llvm/include/llvm/Analysis/VecFuncs.def` - vectorization of tan for
arm64 backends.
- `llvm/lib/Target/AArch64/AArch64FastISel.cpp` - Add tan to the libcall
table
- `llvm/lib/Target/AArch64/AArch64ISelLowering.cpp` - Add tan expansion
for f128, f16, and vector\neon operations
- `llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp` define
`G_FTAN` as a legal arm64 instruction

resolves #94755

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-tan.mir

Modified: 
    llvm/include/llvm/Analysis/VecFuncs.def
    llvm/lib/Target/AArch64/AArch64FastISel.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
    llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
    llvm/test/CodeGen/AArch64/f16-instructions.ll
    llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
    llvm/test/CodeGen/AArch64/illegal-float-ops.ll
    llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
    llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
    llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
    llvm/test/CodeGen/AArch64/vec-libcalls.ll
    llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
    llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 402189769c20c..ffdf8b8c3bc79 100644

--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -92,6 +92,11 @@ TLI_DEFINE_VECFUNC("llvm.sin.f64", "_simd_sin_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("sinf", "_simd_sin_f4", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.sin.f32", "_simd_sin_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
+TLI_DEFINE_VECFUNC("tan", "_simd_tan_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_simd_tan_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanf", "_simd_tan_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_simd_tan_f4", FIXED(4), "_ZGV_LLVM_N4v")
+
 // Floating-Point Arithmetic and Auxiliary Functions
 TLI_DEFINE_VECFUNC("cbrt", "_simd_cbrt_d2", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("cbrtf", "_simd_cbrt_f4", FIXED(4), "_ZGV_LLVM_N4v")
@@ -584,6 +589,7 @@ TLI_DEFINE_VECFUNC("sinpi", "_ZGVnN2v_sinpi", FIXED(2), "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("sqrt", "_ZGVnN2v_sqrt", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("tan", "_ZGVnN2v_tan", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2), "_ZGV_LLVM_N2v")
 
 TLI_DEFINE_VECFUNC("tanh", "_ZGVnN2v_tanh", FIXED(2), "_ZGV_LLVM_N2v")
 
@@ -681,6 +687,7 @@ TLI_DEFINE_VECFUNC("sinpif", "_ZGVnN4v_sinpif", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("sqrtf", "_ZGVnN4v_sqrtf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("tanf", "_ZGVnN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v")
 
 TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN4v_tanhf", FIXED(4), "_ZGV_LLVM_N4v")
 
@@ -828,6 +835,8 @@ TLI_DEFINE_VECFUNC("sqrtf", "_ZGVsMxv_sqrtf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVsMxv_tan", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv")
@@ -1087,6 +1096,11 @@ TLI_DEFINE_VECFUNC("tanf", "armpl_vtanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tan", "armpl_svtan_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tanf", "armpl_svtan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "armpl_vtanq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "armpl_vtanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "armpl_svtan_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "armpl_svtan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("tanh", "armpl_vtanhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("tanhf", "armpl_vtanhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanh", "armpl_svtanh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")

diff  --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 62cf6a2c47acc..56fd15f23363d 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3534,6 +3534,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
   }
   case Intrinsic::sin:
   case Intrinsic::cos:
+  case Intrinsic::tan:
   case Intrinsic::pow: {
     MVT RetVT;
     if (!isTypeLegal(II->getType(), RetVT))
@@ -3542,11 +3543,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     if (RetVT != MVT::f32 && RetVT != MVT::f64)
       return false;
 
-    static const RTLIB::Libcall LibCallTable[3][2] = {
-      { RTLIB::SIN_F32, RTLIB::SIN_F64 },
-      { RTLIB::COS_F32, RTLIB::COS_F64 },
-      { RTLIB::POW_F32, RTLIB::POW_F64 }
-    };
+    static const RTLIB::Libcall LibCallTable[4][2] = {
+        {RTLIB::SIN_F32, RTLIB::SIN_F64},
+        {RTLIB::COS_F32, RTLIB::COS_F64},
+        {RTLIB::TAN_F32, RTLIB::TAN_F64},
+        {RTLIB::POW_F32, RTLIB::POW_F64}};
     RTLIB::Libcall LC;
     bool Is64Bit = RetVT == MVT::f64;
     switch (II->getIntrinsicID()) {
@@ -3558,9 +3559,12 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     case Intrinsic::cos:
       LC = LibCallTable[1][Is64Bit];
       break;
-    case Intrinsic::pow:
+    case Intrinsic::tan:
       LC = LibCallTable[2][Is64Bit];
       break;
+    case Intrinsic::pow:
+      LC = LibCallTable[3][Is64Bit];
+      break;
     }
 
     ArgListTy Args;

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 360a841bdade4..48bf648b00522 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -543,6 +543,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
   setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+  setOperationAction(ISD::FTAN, MVT::f128, Expand);
   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
   setOperationAction(ISD::SETCC, MVT::f128, Custom);
   setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
@@ -727,14 +728,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
   }
 
-  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
-                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
-                  ISD::FEXP,        ISD::FEXP2,        ISD::FEXP10,
-                  ISD::FLOG,        ISD::FLOG2,        ISD::FLOG10,
-                  ISD::STRICT_FREM,
-                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
-                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
-                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+  for (auto Op : {ISD::FREM,         ISD::FPOW,         ISD::FPOWI,
+                  ISD::FCOS,         ISD::FSIN,         ISD::FSINCOS,
+                  ISD::FTAN,         ISD::FEXP,         ISD::FEXP2,
+                  ISD::FEXP10,       ISD::FLOG,         ISD::FLOG2,
+                  ISD::FLOG10,       ISD::STRICT_FREM,  ISD::STRICT_FPOW,
+                  ISD::STRICT_FPOWI, ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
+                  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2, ISD::STRICT_FLOG,
+                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
     setOperationAction(Op, MVT::f16, Promote);
     setOperationAction(Op, MVT::v4f16, Expand);
     setOperationAction(Op, MVT::v8f16, Expand);
@@ -1171,26 +1172,27 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
     // silliness like this:
+    // clang-format off
     for (auto Op :
-         {ISD::SELECT,         ISD::SELECT_CC,
-          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
-          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
-          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
-          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
-          ISD::FSIN,           ISD::FCOS,           ISD::FPOW,
-          ISD::FLOG,           ISD::FLOG2,          ISD::FLOG10,
-          ISD::FEXP,           ISD::FEXP2,          ISD::FEXP10,
-          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
-          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
-          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
-          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
-          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
-          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
-          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
-          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
-          ISD::STRICT_FMAXIMUM})
+         {ISD::SELECT,            ISD::SELECT_CC,
+          ISD::BR_CC,             ISD::FADD,           ISD::FSUB,
+          ISD::FMUL,              ISD::FDIV,           ISD::FMA,
+          ISD::FNEG,              ISD::FABS,           ISD::FCEIL,
+          ISD::FSQRT,             ISD::FFLOOR,         ISD::FNEARBYINT,
+          ISD::FSIN,              ISD::FCOS,           ISD::FTAN,
+          ISD::FPOW,              ISD::FLOG,           ISD::FLOG2,          
+          ISD::FLOG10,            ISD::FEXP,           ISD::FEXP2,
+          ISD::FEXP10,            ISD::FRINT,          ISD::FROUND,
+          ISD::FROUNDEVEN,        ISD::FTRUNC,         ISD::FMINNUM,
+          ISD::FMAXNUM,           ISD::FMINIMUM,       ISD::FMAXIMUM,
+          ISD::STRICT_FADD,       ISD::STRICT_FSUB,    ISD::STRICT_FMUL,
+          ISD::STRICT_FDIV,       ISD::STRICT_FMA,     ISD::STRICT_FCEIL,
+          ISD::STRICT_FFLOOR,     ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,
+          ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  
+          ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
+          ISD::STRICT_FMINIMUM,   ISD::STRICT_FMAXIMUM})
       setOperationAction(Op, MVT::v1f64, Expand);
-
+    // clang-format on
     for (auto Op :
          {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
           ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
@@ -1622,6 +1624,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
+      setOperationAction(ISD::FTAN, VT, Expand);
       setOperationAction(ISD::FEXP, VT, Expand);
       setOperationAction(ISD::FEXP2, VT, Expand);
       setOperationAction(ISD::FEXP10, VT, Expand);
@@ -1803,6 +1806,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
     setOperationAction(ISD::FSIN, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FTAN, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FLOG, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 07a0473888ee5..42cd43c3afa37 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -267,9 +267,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .libcallFor({{s64, s128}})
       .minScalarOrElt(1, MinFPScalar);
 
-  getActionDefinitionsBuilder(
-      {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10,
-       G_FEXP, G_FEXP2, G_FEXP10})
+  getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
+                               G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10})
       // We need a call for these, so we always need to scalarize.
       .scalarize(0)
       // Regardless of FP16 support, widen 16-bit elements to 32-bits.

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index a61931b898aea..eb94cc5d0fb61 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -2313,6 +2313,14 @@ define float @test_sin_f32(float %x) {
   ret float %y
 }
 
+declare float @llvm.tan.f32(float)
+define float @test_tan_f32(float %x) {
+  ; CHECK-LABEL: name:            test_tan_f32
+  ; CHECK: %{{[0-9]+}}:_(s32) = G_FTAN %{{[0-9]+}}
+  %y = call float @llvm.tan.f32(float %x)
+  ret float %y
+}
+
 declare float @llvm.sqrt.f32(float)
 define float @test_sqrt_f32(float %x) {
   ; CHECK-LABEL: name:            test_sqrt_f32

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tan.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tan.mir
new file mode 100644
index 0000000000000..455aae0128364
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-tan.mir
@@ -0,0 +1,227 @@
+# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
+# RUN: -run-pass=legalizer -mattr=+fullfp16 -global-isel %s -o - \
+# RUN: | FileCheck %s
+...
+---
+name:            test_v4f16.tan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+    ; CHECK-LABEL: name:            test_v4f16.tan
+    ; CHECK: [[V1:%[0-9]+]]:_(s16), [[V2:%[0-9]+]]:_(s16), [[V3:%[0-9]+]]:_(s16), [[V4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
+
+    ; CHECK-DAG: [[V1_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V1]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V1_S32]](s32)
+    ; CHECK-NEXT: BL &tanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT1_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT1:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT1_S32]](s32)
+
+    ; CHECK-DAG: [[V2_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V2]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V2_S32]](s32)
+    ; CHECK-NEXT: BL &tanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT2_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT2:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT2_S32]](s32)
+
+    ; CHECK-DAG: [[V3_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V3]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V3_S32]](s32)
+    ; CHECK-NEXT: BL &tanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT3_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT3:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT3_S32]](s32)
+
+    ; CHECK-DAG: [[V4_S32:%[0-9]+]]:_(s32) = G_FPEXT [[V4]](s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[V4_S32]](s32)
+    ; CHECK-NEXT: BL &tanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[ELT4_S32:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[ELT4:%[0-9]+]]:_(s16) = G_FPTRUNC [[ELT4_S32]](s32)
+
+    ; CHECK-DAG: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR [[ELT1]](s16), [[ELT2]](s16), [[ELT3]](s16), [[ELT4]](s16)
+
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = G_FTAN %0
+    $d0 = COPY %1(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v8f16.tan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v8f16.tan
+
+    ; This is big, so let's just check for the 8 calls to tanf, the the
+    ; G_UNMERGE_VALUES, and the G_BUILD_VECTOR. The other instructions ought
+    ; to be covered by the other tests.
+
+    ; CHECK: G_UNMERGE_VALUES
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: BL &tanf
+    ; CHECK: G_BUILD_VECTOR
+
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(<8 x s16>) = G_FTAN %0
+    $q0 = COPY %1(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f32.tan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name:            test_v2f32.tan
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32)
+
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = G_FTAN %0
+    $d0 = COPY %1(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            test_v4f32.tan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name:            test_v4f32.tan
+    ; CHECK: [[V1:%[0-9]+]]:_(s32), [[V2:%[0-9]+]]:_(s32), [[V3:%[0-9]+]]:_(s32), [[V4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s32>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V1]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V2]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V3]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT3:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $s0 = COPY [[V4]](s32)
+    ; CHECK-DAG: BL &tanf
+    ; CHECK-DAG: [[ELT4:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<4 x s32>) = G_BUILD_VECTOR [[ELT1]](s32), [[ELT2]](s32), [[ELT3]](s32), [[ELT4]](s32)
+
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = G_FTAN %0
+    $q0 = COPY %1(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_v2f64.tan
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name:            test_v2f64.tan
+    ; CHECK: [[V1:%[0-9]+]]:_(s64), [[V2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %{{[0-9]+}}(<2 x s64>)
+
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V1]](s64)
+    ; CHECK-DAG: BL &tan
+    ; CHECK-DAG: [[ELT1:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: ADJCALLSTACKDOWN
+    ; CHECK-DAG: $d0 = COPY [[V2]](s64)
+    ; CHECK-DAG: BL &tan
+    ; CHECK-DAG: [[ELT2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-DAG: ADJCALLSTACKUP
+
+    ; CHECK-DAG: %1:_(<2 x s64>) = G_BUILD_VECTOR [[ELT1]](s64), [[ELT2]](s64)
+
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = G_FTAN %0
+    $q0 = COPY %1(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_tan_half
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $h0
+    ; CHECK-LABEL: name:            test_tan_half
+    ; CHECK: [[REG1:%[0-9]+]]:_(s32) = G_FPEXT %0(s16)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN
+    ; CHECK-NEXT: $s0 = COPY [[REG1]](s32)
+    ; CHECK-NEXT: BL &tanf
+    ; CHECK-NEXT: ADJCALLSTACKUP
+    ; CHECK-NEXT: [[REG2:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[RES:%[0-9]+]]:_(s16) = G_FPTRUNC [[REG2]](s32)
+
+    %0:_(s16) = COPY $h0
+    %1:_(s16) = G_FTAN %0
+    $h0 = COPY %1(s16)
+    RET_ReallyLR implicit $h0

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index d71111b57efe5..04c9a08c50038 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -675,8 +675,9 @@
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FTAN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected

diff  --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index fa362ae798aa8..998f7dd38c3f5 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -759,6 +759,7 @@ declare half @llvm.sqrt.f16(half %a) #0
 declare half @llvm.powi.f16.i32(half %a, i32 %b) #0
 declare half @llvm.sin.f16(half %a) #0
 declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.tan.f16(half %a) #0
 declare half @llvm.pow.f16(half %a, half %b) #0
 declare half @llvm.exp.f16(half %a) #0
 declare half @llvm.exp2.f16(half %a) #0
@@ -870,6 +871,31 @@ define half @test_cos(half %a) #0 {
   ret half %r
 }
 
+; FALLBACK-NOT: remark:{{.*}}test_tan
+; FALLBACK-FP16-NOT: remark:{{.*}}test_tan
+
+; CHECK-COMMON-LABEL: test_tan:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-COMMON-NEXT: mov  x29, sp
+; CHECK-COMMON-NEXT: fcvt s0, h0
+; CHECK-COMMON-NEXT: bl {{_?}}tanf
+; CHECK-COMMON-NEXT: fcvt h0, s0
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16
+; CHECK-COMMON-NEXT: ret
+
+; GISEL-LABEL: test_tan:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]!
+; GISEL-NEXT: mov  x29, sp
+; GISEL-NEXT: fcvt s0, h0
+; GISEL-NEXT: bl {{_?}}tanf
+; GISEL-NEXT: fcvt h0, s0
+; GISEL-NEXT: ldp x29, x30, [sp], #16
+; GISEL-NEXT: ret
+define half @test_tan(half %a) #0 {
+  %r = call half @llvm.tan.f16(half %a)
+  ret half %r
+}
+
 ; CHECK-COMMON-LABEL: test_pow:
 ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
 ; CHECK-COMMON-NEXT: mov  x29, sp

diff  --git a/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll b/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
index 16bab1565e7fa..7ff4dfca3f4c3 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
@@ -67,6 +67,28 @@ define double @cos_f64(double %a) {
   ret double %1
 }
 
+define float @tan_f32(float %a) {
+; SMALL-LABEL: tan_f32
+; SMALL:       bl _tanf
+; LARGE-LABEL: tan_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _tanf at GOTPAGE
+; LARGE:       ldr [[REG]], [[[REG]], _tanf at GOTPAGEOFF]
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.tan.f32(float %a)
+  ret float %1
+}
+
+define double @tan_f64(double %a) {
+; SMALL-LABEL: tan_f64
+; SMALL:       bl _tan
+; LARGE-LABEL: tan_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _tan at GOTPAGE
+; LARGE:       ldr [[REG]], [[[REG]], _tan at GOTPAGEOFF]
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.tan.f64(double %a)
+  ret double %1
+}
+
 define float @pow_f32(float %a, float %b) {
 ; SMALL-LABEL: pow_f32
 ; SMALL:       bl _powf
@@ -92,5 +114,7 @@ declare float @llvm.sin.f32(float)
 declare double @llvm.sin.f64(double)
 declare float @llvm.cos.f32(float)
 declare double @llvm.cos.f64(double)
+declare float @llvm.tan.f32(float)
+declare double @llvm.tan.f64(double)
 declare float @llvm.pow.f32(float, float)
 declare double @llvm.pow.f64(double, double)

diff  --git a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
index 20435e10d89ff..3281a98767795 100644
--- a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -159,6 +159,28 @@ define void @test_sin(float %float, double %double, fp128 %fp128) {
 
 }
 
+declare float @llvm.tan.f32(float)
+declare double @llvm.tan.f64(double)
+declare fp128 @llvm.tan.f128(fp128)
+
+define void @test_tan(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_tan:
+
+   %tanfloat = call float @llvm.tan.f32(float %float)
+   store float %tanfloat, ptr @varfloat
+; CHECK: bl tanf
+
+   %tandouble = call double @llvm.tan.f64(double %double)
+   store double %tandouble, ptr @vardouble
+; CHECK: bl tan
+
+   %tanfp128 = call fp128 @llvm.tan.f128(fp128 %fp128)
+   store fp128 %tanfp128, ptr @varfp128
+; CHECK: bl tanl
+  ret void
+
+}
+
 declare float @llvm.pow.f32(float, float)
 declare double @llvm.pow.f64(double, double)
 declare fp128 @llvm.pow.f128(fp128, fp128)

diff  --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
index 758df0493cc50..ce0c5572577dc 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
@@ -15,7 +15,7 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [36 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vfmodq_f64, ptr @armpl_vfmodq_f32, ptr @armpl_svfmod_f64_x, ptr @armpl_svfmod_f32_x], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [40 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vtanq_f64, ptr @armpl_vtanq_f32, ptr @armpl_svtan_f64_x, ptr @armpl_svtan_f32_x, ptr @armpl_vfmodq_f64, ptr @armpl_vfmodq_f32, ptr @armpl_svfmod_f64_x, ptr @armpl_svfmod_f32_x], section "llvm.metadata"
 ;.
 define <2 x double> @llvm_cos_f64(<2 x double> %in) {
 ; CHECK-LABEL: define <2 x double> @llvm_cos_f64
@@ -424,6 +424,50 @@ define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 {
   ret <vscale x 4 x float> %1
 }
 
+declare <2 x double> @llvm.tan.v2f64(<2 x double>)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
+declare <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float>)
+
+define <2 x double> @llvm_tan_f64(<2 x double> %in) {
+; CHECK-LABEL: define <2 x double> @llvm_tan_f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @armpl_vtanq_f64(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tan_f32(<4 x float> %in) {
+; CHECK-LABEL: define <4 x float> @llvm_tan_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanq_f32(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tan_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @llvm_tan_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @llvm_tan_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svtan_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
 
 define <2 x double> @frem_f64(<2 x double> %in) {
 ; CHECK-LABEL: define <2 x double> @frem_f64

diff  --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
index 9e214655e413a..c58a9bc8342de 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxvv_fmod, ptr @_ZGVsMxvv_fmodf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [20 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxvv_fmod, ptr @_ZGVsMxvv_fmodf], section "llvm.metadata"
 ;.
 define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_ceil_vscale_f64(
@@ -366,6 +366,25 @@ define <vscale x 4 x float> @llvm_sqrt_vscale_f32(<vscale x 4 x float> %in) {
   ret <vscale x 4 x float> %1
 }
 
+define <vscale x 2 x double> @llvm_tan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+
 define <vscale x 2 x double> @llvm_trunc_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_trunc_vscale_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
@@ -442,6 +461,8 @@ declare <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
 ;.

diff  --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
index f408df570fdc0..1df2caed4244e 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2vv_fmod, ptr @_ZGVnN4vv_fmodf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [20 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2vv_fmod, ptr @_ZGVnN4vv_fmodf], section "llvm.metadata"
 ;.
 define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
 ; CHECK-LABEL: @llvm_ceil_f64(
@@ -366,6 +366,24 @@ define <4 x float> @llvm_sqrt_f32(<4 x float> %in) {
   ret <4 x float> %1
 }
 
+define <2 x double> @llvm_tan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
 define <2 x double> @llvm_trunc_f64(<2 x double> %in) {
 ; CHECK-LABEL: @llvm_trunc_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> [[IN:%.*]])
@@ -442,6 +460,8 @@ declare <2 x double> @llvm.sin.v2f64(<2 x double>)
 declare <4 x float> @llvm.sin.v4f32(<4 x float>)
 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare <2 x double> @llvm.tan.v2f64(<2 x double>)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
 declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
 declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
 ;.

diff  --git a/llvm/test/CodeGen/AArch64/vec-libcalls.ll b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
index aa2cca40a92c2..9bbac5d69c28c 100644
--- a/llvm/test/CodeGen/AArch64/vec-libcalls.ll
+++ b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
@@ -20,6 +20,7 @@ declare <3 x double> @llvm.sin.v3f64(<3 x double>)
 declare <3 x float> @llvm.fabs.v3f32(<3 x float>)
 declare <3 x float> @llvm.ceil.v3f32(<3 x float>)
 declare <3 x float> @llvm.cos.v3f32(<3 x float>)
+declare <3 x float> @llvm.tan.v3f32(<3 x float>)
 declare <3 x float> @llvm.exp.v3f32(<3 x float>)
 declare <3 x float> @llvm.exp2.v3f32(<3 x float>)
 declare <3 x float> @llvm.floor.v3f32(<3 x float>)
@@ -297,6 +298,37 @@ define <3 x float> @cos_v3f32(<3 x float> %x) nounwind {
   ret <3 x float> %r
 }
 
+define <3 x float> @tan_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: tan_v3f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov s0, v0.s[2]
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %r = call <3 x float> @llvm.tan.v3f32(<3 x float> %x)
+  ret <3 x float> %r
+}
+
 define <3 x float> @exp_v3f32(<3 x float> %x) nounwind {
 ; CHECK-LABEL: exp_v3f32:
 ; CHECK:       // %bb.0:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
index 2017797288c29..89a513515b205 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
@@ -291,6 +291,150 @@ for.end:
   ret void
 }
 
+declare float @sinf(float) nounwind readnone
+define void @sinf_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @sinf_v4f32(
+; CHECK: call <4 x float> @_simd_sin_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @sinf(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @sin(double) nounwind readnone
+define void @sin_v2f64(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @sin_v2f64(
+; CHECK: call <2 x double> @_simd_sin_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @sin(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @tanf(float) nounwind readnone
+define void @tanf_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tanf_v4f32(
+; CHECK: call <4 x float> @_simd_tan_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @tanf(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @tan(double) nounwind readnone
+define void @tan_v2f64(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tan_v2f64(
+; CHECK: call <2 x double> @_simd_tan_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @tan(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.tan.f32(float) nounwind readnone
+define void @tan_v4f32_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tan_v4f32_intrinsic(
+; CHECK: call <4 x float> @_simd_tan_f4(<4 x float>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %lv = load float, ptr %gep.y, align 4
+  %call = tail call float @llvm.tan.f32(float %lv)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @llvm.tan.f64(double) nounwind readnone
+define void @tan_v2f64_intrinsic(i64 %n, ptr noalias %y, ptr noalias %x) {
+; CHECK-LABEL: @tan_v2f64_intrinsic(
+; CHECK: call <2 x double> @_simd_tan_d2(<2 x double>
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
+  %lv = load double, ptr %gep.y, align 4
+  %call = tail call double @llvm.tan.f64(double %lv)
+  %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
+  store double %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 declare float @cbrtf(float) nounwind readnone
 define void @cbrtf_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @cbrtf_v4f32(

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index 2a552077a42b6..f0ae0093b2641 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|exp|log|sin|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|trunc)" --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|exp|log|sin|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|trunc)" --version 2
 
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
@@ -1472,6 +1472,79 @@ define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) {
   ret void
 }
 
+declare double @llvm.tan.f64(double)
+declare float @llvm.tan.f32(float)
+
+define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tan_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tan_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tan_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tan_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %call = tail call double @llvm.tan.f64(double %in)
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %call, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tan_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON:    [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tan_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tan_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON:    [[TMP3:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tan_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE:    [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %call = tail call float @llvm.tan.f32(float %in)
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %call, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @llvm.trunc.f64(double)
 declare float @llvm.trunc.f32(float)