[llvm] c4fa504 - [AArch64] Enable libm vectorized functions via SLEEF

Daniel Kiss via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 20 09:52:45 PST 2023


Author: Daniel Kiss
Date: 2023-01-20T18:52:38+01:00
New Revision: c4fa504f797f68297c252dc91a24c7d37c1de4df

URL: https://github.com/llvm/llvm-project/commit/c4fa504f797f68297c252dc91a24c7d37c1de4df
DIFF: https://github.com/llvm/llvm-project/commit/c4fa504f797f68297c252dc91a24c7d37c1de4df.diff

LOG: [AArch64] Enable libm vectorized functions via SLEEF

This enables the vectorization of libm functions via SLEEF: http://sleef.org/.

  - A new vector library enum value is added to TargetLibraryInfoImpl in TargetLibraryInfo.h: SLEEFGNUABI.
  - A new value is added to the ClVectorLibrary option in TargetLibraryInfo.cpp: sleefgnuabi.
  - A comprehensive test case is included in this changeset.
  - A new vector library argument value is added to -fveclib: -fveclib=SLEEF (a usage sketch follows this list).
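
A minimal usage sketch (the file name and loop below are illustrative, not
part of the patch; the flag and the SLEEF symbol come from the changes
below, and it assumes the loop vectorizer runs at -O2):

  /* sinloop.c: hypothetical example */
  #include <math.h>

  void f(double *restrict a, int n) {
    /* With -fveclib=SLEEF on AArch64, the loop vectorizer can widen this
       call to the SLEEF entry point _ZGVnN2v_sin (see VecFuncs.def below). */
    for (int i = 0; i < n; ++i)
      a[i] = sin(a[i]);
  }

  $ clang --target=aarch64-linux-gnu -O2 -fveclib=SLEEF -S -o - sinloop.c
  # expect a call to _ZGVnN2v_sin in the output, assuming the loop vectorizes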

libm functions that are vectorized via SLEEF:
acos
asin
atan
atan2
atanh
cos
cosh
exp
exp2
exp10
lgamma
log10
log2
log
pow
sin
sinh
sqrt
tan
tanh
tgamma
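
For reference, the mangled names in the VecFuncs.def additions below follow
the vector function ABI name scheme for AArch64,
_ZGV<isa><mask><vlen><params>_<scalar name>. Decoding two of the new entries:

  _ZGVnN2v_sin   -> AdvSIMD ('n'), unmasked ('N'), 2 lanes, one vector arg ('v'), of "sin"
  _ZGVnN4vv_powf -> AdvSIMD ('n'), unmasked ('N'), 4 lanes, two vector args ('vv'), of "powf"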

Co-authored-by: Stefan Teleman

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D134719

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll

Modified: 
    clang/include/clang/Basic/CodeGenOptions.h
    clang/include/clang/Driver/Options.td
    clang/lib/CodeGen/BackendUtil.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/test/Driver/autocomplete.c
    clang/test/Driver/fveclib.c
    llvm/include/llvm/Analysis/TargetLibraryInfo.h
    llvm/include/llvm/Analysis/VecFuncs.def
    llvm/lib/Analysis/TargetLibraryInfo.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index 4cc0d05d177b3..4175fe3072ab8 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -60,6 +60,7 @@ class CodeGenOptions : public CodeGenOptionsBase {
     LIBMVEC,           // GLIBC vector math library.
     MASSV,             // IBM MASS vector library.
     SVML,              // Intel short vector math library.
+    SLEEF,             // SLEEF SIMD Library for Evaluating Elementary Functions.
     Darwin_libsystem_m // Use Darwin's libsytem_m vector functions.
   };
 

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index ec7b5bfa1554a..343cc77a18c43 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2473,9 +2473,9 @@ def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_
   Alias<fno_global_isel>;
 def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>, Flags<[CC1Option]>,
     HelpText<"Use the given vector functions library">,
-    Values<"Accelerate,libmvec,MASSV,SVML,Darwin_libsystem_m,none">,
+    Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,none">,
     NormalizedValuesScope<"CodeGenOptions">,
-    NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML",
+    NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF",
                       "Darwin_libsystem_m", "NoLibrary"]>,
     MarshallingInfoEnum<CodeGenOpts<"VecLib">, "NoLibrary">;
 def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">, Group<f_Group>,

diff  --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 937a8dc40667e..ecc727d6dd281 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -271,27 +271,28 @@ static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,
 
   switch (CodeGenOpts.getVecLib()) {
   case CodeGenOptions::Accelerate:
-    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate);
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate,
+                                             TargetTriple);
     break;
   case CodeGenOptions::LIBMVEC:
-    switch(TargetTriple.getArch()) {
-      default:
-        break;
-      case llvm::Triple::x86_64:
-        TLII->addVectorizableFunctionsFromVecLib
-                (TargetLibraryInfoImpl::LIBMVEC_X86);
-        break;
-    }
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC_X86,
+                                             TargetTriple);
     break;
   case CodeGenOptions::MASSV:
-    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::MASSV);
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::MASSV,
+                                             TargetTriple);
     break;
   case CodeGenOptions::SVML:
-    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML);
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML,
+                                             TargetTriple);
+    break;
+  case CodeGenOptions::SLEEF:
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SLEEFGNUABI,
+                                             TargetTriple);
     break;
   case CodeGenOptions::Darwin_libsystem_m:
     TLII->addVectorizableFunctionsFromVecLib(
-        TargetLibraryInfoImpl::DarwinLibSystemM);
+        TargetLibraryInfoImpl::DarwinLibSystemM, TargetTriple);
     break;
   default:
     break;

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 9821c386c87d6..f766eeda3cc4a 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5131,7 +5131,26 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  Args.AddLastArg(CmdArgs, options::OPT_fveclib);
+  if (Arg *A = Args.getLastArg(options::OPT_fveclib)) {
+    StringRef Name = A->getValue();
+    if (Name == "SVML") {
+      if (Triple.getArch() != llvm::Triple::x86 &&
+          Triple.getArch() != llvm::Triple::x86_64)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Name << Triple.getArchName();
+    } else if (Name == "LIBMVEC-X86") {
+      if (Triple.getArch() != llvm::Triple::x86 &&
+          Triple.getArch() != llvm::Triple::x86_64)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Name << Triple.getArchName();
+    } else if (Name == "SLEEF") {
+      if (Triple.getArch() != llvm::Triple::aarch64 &&
+          Triple.getArch() != llvm::Triple::aarch64_be)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Name << Triple.getArchName();
+    }
+    A->render(Args, CmdArgs);
+  }
 
   if (Args.hasFlag(options::OPT_fmerge_all_constants,
                    options::OPT_fno_merge_all_constants, false))

diff  --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index 502eee107d0b8..ea647e3c7ef0f 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -84,6 +84,7 @@
 // FVECLIBALL-NEXT: libmvec
 // FVECLIBALL-NEXT: MASSV
 // FVECLIBALL-NEXT: none
+// FVECLIBALL-NEXT: SLEEF
 // FVECLIBALL-NEXT: SVML
 // RUN: %clang --autocomplete=-fshow-overloads= | FileCheck %s -check-prefix=FSOVERALL
 // FSOVERALL: all

diff  --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 2bf7558a02af8..d6049763dd112 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -3,6 +3,7 @@
 // RUN: %clang -### -c -fveclib=libmvec %s 2>&1 | FileCheck -check-prefix CHECK-libmvec %s
 // RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck -check-prefix CHECK-MASSV %s
 // RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck -check-prefix CHECK-DARWIN_LIBSYSTEM_M %s
+// RUN: %clang -### -c --target=aarch64-none-none -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-SLEEF %s
 // RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck -check-prefix CHECK-INVALID %s
 
 // CHECK-NOLIB: "-fveclib=none"
@@ -10,9 +11,15 @@
 // CHECK-libmvec: "-fveclib=libmvec"
 // CHECK-MASSV: "-fveclib=MASSV"
 // CHECK-DARWIN_LIBSYSTEM_M: "-fveclib=Darwin_libsystem_m"
+// CHECK-SLEEF: "-fveclib=SLEEF"
 
 // CHECK-INVALID: error: invalid value 'something' in '-fveclib=something'
 
+// RUN: not %clang --target=x86-none-none -c -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
+// RUN: not %clang --target=aarch64-none-none -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
+// RUN: not %clang --target=aarch64-none-none -c -fveclib=SVML %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
+// CHECK-ERROR: unsupported option {{.*}} for target
+
 // RUN: %clang -fveclib=Accelerate %s -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK %s
 // CHECK-LINK: "-framework" "Accelerate"
 

diff  --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index f4a48262328ee..8fcfbdbd6665c 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -94,7 +94,8 @@ class TargetLibraryInfoImpl {
     DarwinLibSystemM, // Use Darwin's libsystem_m.
     LIBMVEC_X86,      // GLIBC Vector Math library.
     MASSV,            // IBM MASS vector library.
-    SVML              // Intel short vector math library.
+    SVML,             // Intel short vector math library.
+    SLEEFGNUABI       // SLEEF - SIMD Library for Evaluating Elementary Functions.
   };
 
   TargetLibraryInfoImpl();
@@ -154,7 +155,8 @@ class TargetLibraryInfoImpl {
 
   /// Calls addVectorizableFunctions with a known preset of functions for the
   /// given vector library.
-  void addVectorizableFunctionsFromVecLib(enum VectorLibrary VecLib);
+  void addVectorizableFunctionsFromVecLib(enum VectorLibrary VecLib,
+                                          const llvm::Triple &TargetTriple);
 
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.

diff  --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 8a1ebec4c7277..85d208b946252 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -466,6 +466,146 @@ TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f4", FIXED(4))
 TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f8", FIXED(8))
 TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16))
 
+#elif defined(TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS)
+
+TLI_DEFINE_VECFUNC( "acos", "_ZGVnN2v_acos", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.acos.f64", "_ZGVnN2v_acos", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "asin", "_ZGVnN2v_asin", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.asin.f64", "_ZGVnN2v_asin", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "atan", "_ZGVnN2v_atan", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.atan.f64", "_ZGVnN2v_atan", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "atan2", "_ZGVnN2vv_atan2", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.atan2.f64", "_ZGVnN2vv_atan2", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.atan2.v2f64", "_ZGVnN2vv_atan2", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "atanh", "_ZGVnN2v_atanh", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.atanh.f64", "_ZGVnN2v_atanh", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "cos", "_ZGVnN2v_cos", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "cosh", "_ZGVnN2v_cosh", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.cosh.f64", "_ZGVnN2v_cosh", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "exp", "_ZGVnN2v_exp", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp.v2f64", "_ZGVnN2v_exp", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "exp2", "_ZGVnN2v_exp2", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp2.v2f64", "_ZGVnN2v_exp2", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "exp10", "_ZGVnN2v_exp10", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp10.f64", "_ZGVnN2v_exp10", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.exp10.v2f64", "_ZGVnN2v_exp10", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "lgamma", "_ZGVnN2v_lgamma", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.lgamma.f64", "_ZGVnN2v_lgamma", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "log", "_ZGVnN2v_log", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.log.f64", "_ZGVnN2v_log", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "log2", "_ZGVnN2v_log2", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "log10", "_ZGVnN2v_log10", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "pow", "_ZGVnN2vv_pow", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.pow.v2f64", "_ZGVnN2vv_pow", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "sin", "_ZGVnN2v_sin", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.sin.f64", "_ZGVnN2v_sin", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "sinh", "_ZGVnN2v_sinh", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.sinh.f64", "_ZGVnN2v_sinh", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "sqrt", "_ZGVnN2v_sqrt", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.sqrt.f64", "_ZGVnN2v_sqrt", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "tan", "_ZGVnN2v_tan", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "tanh", "_ZGVnN2v_tanh", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.tanh.f64", "_ZGVnN2v_tanh", FIXED(2))
+
+TLI_DEFINE_VECFUNC( "tgamma", "_ZGVnN2v_tgamma", FIXED(2))
+TLI_DEFINE_VECFUNC( "llvm.tgamma.f64", "_ZGVnN2v_tgamma", FIXED(2))
+
+#elif defined(TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS)
+
+TLI_DEFINE_VECFUNC( "acosf", "_ZGVnN4v_acosf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.acos.f32", "_ZGVnN4v_acosf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "asinf", "_ZGVnN4v_asinf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.asin.f32", "_ZGVnN4v_asinf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "atanf", "_ZGVnN4v_atanf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.atan.f32", "_ZGVnN4v_atanf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "atan2f", "_ZGVnN4vv_atan2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.atan2.f32", "_ZGVnN4vv_atan2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.atan2.v4f32", "_ZGVnN4vv_atan2f", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "atanhf", "_ZGVnN4v_atanhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.atanh.f32", "_ZGVnN4v_atanhf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "cosf", "_ZGVnN4v_cosf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "coshf", "_ZGVnN4v_coshf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.cosh.f32", "_ZGVnN4v_coshf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "expf", "_ZGVnN4v_expf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp.v4f32", "_ZGVnN4v_expf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "exp2f", "_ZGVnN4v_exp2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp2.v4f32", "_ZGVnN4v_exp2f", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "exp10f", "_ZGVnN4v_exp10f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp10.f32", "_ZGVnN4v_exp10f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.exp10.v4f32", "_ZGVnN4v_exp10f", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "lgammaf", "_ZGVnN4v_lgammaf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.lgamma.f32", "_ZGVnN4v_lgammaf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "logf", "_ZGVnN4v_logf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.log.f32", "_ZGVnN4v_logf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "log2f", "_ZGVnN4v_log2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "log10f", "_ZGVnN4v_log10f", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "powf", "_ZGVnN4vv_powf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.pow.v4f32", "_ZGVnN4vv_powf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "sinf", "_ZGVnN4v_sinf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "sinhf", "_ZGVnN4v_sinhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.sinh.f32", "_ZGVnN4v_sinhf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "sqrtf", "_ZGVnN4v_sqrtf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.sqrt.f32", "_ZGVnN4v_sqrtf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "tanf", "_ZGVnN4v_tanf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "tanhf", "_ZGVnN4v_tanhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4))
+
+TLI_DEFINE_VECFUNC( "tgammaf", "_ZGVnN4v_tgammaf", FIXED(4))
+TLI_DEFINE_VECFUNC( "llvm.tgamma.f32", "_ZGVnN4v_tgammaf", FIXED(4))
+
 #else
 #error "Must choose which vector library functions are to be defined."
 #endif
@@ -476,4 +616,6 @@ TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16))
 #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS
 #undef TLI_DEFINE_SVML_VECFUNCS
+#undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
+#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES

diff  --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 4c85bcca0a72a..31cc0e7ec30ea 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -31,7 +31,9 @@ static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary(
                clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV",
                           "IBM MASS vector library"),
                clEnumValN(TargetLibraryInfoImpl::SVML, "SVML",
-                          "Intel SVML library")));
+                          "Intel SVML library"),
+               clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi",
+                          "SIMD Library for Evaluating Elementary Functions")));
 
 StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] =
     {
@@ -852,7 +854,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_vec_free);
   }
 
-  TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary);
+  TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary, T);
 }
 
 TargetLibraryInfoImpl::TargetLibraryInfoImpl() {
@@ -1134,7 +1136,7 @@ void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef<VecDesc> Fns) {
 }
 
 void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
-    enum VectorLibrary VecLib) {
+    enum VectorLibrary VecLib, const llvm::Triple &TargetTriple) {
   switch (VecLib) {
   case Accelerate: {
     const VecDesc VecFuncs[] = {
@@ -1176,6 +1178,27 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     addVectorizableFunctions(VecFuncs);
     break;
   }
+  case SLEEFGNUABI: {
+    const VecDesc VecFuncs_VF2[] = {
+#define TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
+#include "llvm/Analysis/VecFuncs.def"
+    };
+    const VecDesc VecFuncs_VF4[] = {
+#define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
+#include "llvm/Analysis/VecFuncs.def"
+    };
+
+    switch (TargetTriple.getArch()) {
+    default:
+      break;
+    case llvm::Triple::aarch64:
+    case llvm::Triple::aarch64_be:
+      addVectorizableFunctions(VecFuncs_VF2);
+      addVectorizableFunctions(VecFuncs_VF4);
+      break;
+    }
+    break;
+  }
   case NoLibrary:
     break;
   }

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
new file mode 100644
index 0000000000000..623ac986fcace
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
@@ -0,0 +1,1076 @@
+; Do NOT use -O3. It will lower exp2 to ldexp, and the test will fail.
+; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib -loop-unroll -loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare double @acos(double) #0
+declare float @acosf(float) #0
+declare double @llvm.acos.f64(double) #0
+declare float @llvm.acos.f32(float) #0
+
+define void @acos_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @acos_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @acos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @acos_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @acos_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @acosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @asin(double) #0
+declare float @asinf(float) #0
+declare double @llvm.asin.f64(double) #0
+declare float @llvm.asin.f32(float) #0
+
+define void @asin_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @asin_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @asin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @asin_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @asin_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @asinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atan(double) #0
+declare float @atanf(float) #0
+declare double @llvm.atan.f64(double) #0
+declare float @llvm.atan.f32(float) #0
+
+define void @atan_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atan_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atan(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atan_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atan_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atanf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atan2(double, double) #0
+declare float @atan2f(float, float) #0
+declare double @llvm.atan2.f64(double, double) #0
+declare float @llvm.atan2.f32(float, float) #0
+
+define void @atan2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atan2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atan2(double %conv, double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atan2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atan2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atan2f(float %conv, float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atanh(double) #0
+declare float @atanhf(float) #0
+declare double @llvm.atanh.f64(double) #0
+declare float @llvm.atanh.f32(float) #0
+
+define void @atanh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atanh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atanh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atanh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atanh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+define void @cos_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @cos_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @cos_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @cosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @cosh(double) #0
+declare float @coshf(float) #0
+declare double @llvm.cosh.f64(double) #0
+declare float @llvm.cosh.f32(float) #0
+
+define void @cosh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @cosh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cosh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @cosh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @cosh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @coshf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+define void @exp_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @expf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp2(double) #0
+declare float @exp2f(float) #0
+declare double @llvm.exp2.f64(double) #0
+declare float @llvm.exp2.f32(float) #0
+
+define void @exp2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp2(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @exp2f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp10(double) #0
+declare float @exp10f(float) #0
+declare double @llvm.exp10.f64(double) #0
+declare float @llvm.exp10.f32(float) #0
+
+define void @exp10_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp10_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp10(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp10_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp10_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @exp10f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @lgamma(double) #0
+declare float @lgammaf(float) #0
+declare double @llvm.lgamma.f64(double) #0
+declare float @llvm.lgamma.f32(float) #0
+
+define void @lgamma_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @lgamma_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @lgamma(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @lgamma_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @lgamma_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @lgammaf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log10(double) #0
+declare float @log10f(float) #0
+declare double @llvm.log10.f64(double) #0
+declare float @llvm.log10.f32(float) #0
+
+define void @log10_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log10_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log10(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log10_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log10_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @log10f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log2(double) #0
+declare float @log2f(float) #0
+declare double @llvm.log2.f64(double) #0
+declare float @llvm.log2.f32(float) #0
+
+define void @log2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log2(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @log2f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+define void @log_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @logf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+define void @pow_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @pow_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @pow(double %conv, double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @pow_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @pow_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @powf(float %conv, float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+define void @sin_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sin_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sin_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sinh(double) #0
+declare float @sinhf(float) #0
+declare double @llvm.sinh.f64(double) #0
+declare float @llvm.sinh.f32(float) #0
+
+define void @sinh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sinh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sinh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sinh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sinh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sqrt(double) #0
+declare float @sqrtf(float) #0
+declare double @llvm.sqrt.f64(double) #0
+declare float @llvm.sqrt.f32(float) #0
+
+define void @sqrt_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sqrt_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sqrt(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sqrt_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sqrt_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sqrtf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tan(double) #0
+declare float @tanf(float) #0
+declare double @llvm.tan.f64(double) #0
+declare float @llvm.tan.f32(float) #0
+
+define void @tan_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tan_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tan(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tan_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tan_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tanh(double) #0
+declare float @tanhf(float) #0
+declare double @llvm.tanh.f64(double) #0
+declare float @llvm.tanh.f32(float) #0
+
+define void @tanh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tanh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tanh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tanh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tanh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tgamma(double) #0
+declare float @tgammaf(float) #0
+declare double @llvm.tgamma.f64(double) #0
+declare float @llvm.tgamma.f32(float) #0
+
+define void @tgamma_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tgamma_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tgamma(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tgamma_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tgamma_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tgammaf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}


        

