[llvm] r347510 - [AArch64] Enable libm vectorized functions via SLEEF

Joel Jones via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 23 22:41:39 PST 2018


Author: joel_k_jones
Date: Fri Nov 23 22:41:39 2018
New Revision: 347510

URL: http://llvm.org/viewvc/llvm-project?rev=347510&view=rev
Log:
[AArch64] Enable libm vectorized functions via SLEEF

This changeset is modeled after Intel's submission for SVML. It enables
vectorization of libm functions via SLEEF: http://sleef.org/.

 * A new vectorization library enumerator, SLEEFGNUABI, is added to the
   VectorLibrary enum in TargetLibraryInfo.h.
 * A matching value, sleefgnuabi, is added to TargetLibraryInfoImpl's
   ClVectorLibrary command-line option.
 * A comprehensive test case is included in this changeset.
 * In a separate changeset (for clang), a new vectorization library argument is
   added to -fveclib: -fveclib=SLEEF. A short usage sketch follows this list.
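
As a usage sketch (assuming the companion clang change that accepts
-fveclib=SLEEF, and an AArch64 target), a scalar loop like the one below is
what the loop vectorizer rewrites into calls to the SLEEF entry points, here
_ZGVnN4v_sinf:

  // loop.cpp -- hypothetical invocation once the clang-side change lands:
  //   clang++ -O2 --target=aarch64-unknown-linux-gnu -fveclib=SLEEF -S loop.cpp
  #include <math.h>

  void sin_loop(float *out, int n) {
    for (int i = 0; i < n; ++i)
      out[i] = sinf((float)i); // becomes <4 x float> @_ZGVnN4v_sinf
  }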

libm functions that are vectorized via SLEEF:

acos
asin
atan
atan2
atanh
cos
cosh
exp
exp2
exp10
lgamma
log10
log2
log
pow
sin
sinh
sqrt
tan
tanh
tgamma

Patch by Stefan Teleman
Differential Revision: https://reviews.llvm.org/D53927

Added:
    llvm/trunk/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
Modified:
    llvm/trunk/include/llvm/Analysis/TargetLibraryInfo.h
    llvm/trunk/include/llvm/IR/Intrinsics.td
    llvm/trunk/lib/Analysis/TargetLibraryInfo.cpp

Modified: llvm/trunk/include/llvm/Analysis/TargetLibraryInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetLibraryInfo.h?rev=347510&r1=347509&r2=347510&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetLibraryInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetLibraryInfo.h Fri Nov 23 22:41:39 2018
@@ -47,6 +47,7 @@ struct VecDesc {
 class TargetLibraryInfoImpl {
   friend class TargetLibraryInfo;
 
+  Triple TT;
   unsigned char AvailableArray[(NumLibFuncs+3)/4];
   llvm::DenseMap<unsigned, std::string> CustomNames;
   static StringRef const StandardNames[NumLibFuncs];
@@ -87,7 +88,8 @@ public:
   enum VectorLibrary {
     NoLibrary,  // Don't use any vector library.
     Accelerate, // Use Accelerate framework.
-    SVML        // Intel short vector math library.
+    SVML,       // Intel short vector math library.
+    SLEEFGNUABI // SLEEF - SIMD Library for Evaluating Elementary Functions.
   };
 
   TargetLibraryInfoImpl();

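To see what the new TT member enables, here is a minimal sketch (not part of
this patch; assumes the usual LLVM headers and build setup) of querying the
SLEEF mappings that get registered for an AArch64 triple:

  #include "llvm/ADT/Triple.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"

  using namespace llvm;

  // Returns "_ZGVnN4v_sinf" once the SLEEF mappings are registered;
  // an empty StringRef for targets without a mapping.
  StringRef sleefSinf4() {
    TargetLibraryInfoImpl TLII(Triple("aarch64-unknown-linux-gnu"));
    TLII.addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SLEEFGNUABI);
    return TLII.getVectorizedFunction("sinf", /*VF=*/4);
  }
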
Modified: llvm/trunk/include/llvm/IR/Intrinsics.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/Intrinsics.td?rev=347510&r1=347509&r2=347510&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td (original)
+++ llvm/trunk/include/llvm/IR/Intrinsics.td Fri Nov 23 22:41:39 2018
@@ -430,14 +430,34 @@ let IntrProperties = [IntrNoMem, IntrSpe
   def int_powi : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty]>;
   def int_sin  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_cos  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_acos : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_asin : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_atan : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_atan2 : Intrinsic<[llvm_anyfloat_ty],
+                            [LLVMMatchType<0>, LLVMMatchType<0>]>;
+  def int_tan  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_pow  : Intrinsic<[llvm_anyfloat_ty],
                            [LLVMMatchType<0>, LLVMMatchType<0>]>;
+
+  def int_cosh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_sinh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_tanh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+
+  def int_asinh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_acosh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_atanh : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+
+  def int_lgamma : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_tgamma : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+
   def int_log  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_log10: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_log2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_exp  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_exp2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_exp10 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_fabs : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+
   def int_copysign : Intrinsic<[llvm_anyfloat_ty],
                                [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_floor : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;

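With the TableGen additions above, the generated Intrinsic::acos and friends
become usable from C++. A sketch (not part of this patch) of emitting the
overloaded f64 form through IRBuilder:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  // Emits a call to double @llvm.acos.f64(double %X).
  Value *emitAcosF64(Module &M, IRBuilder<> &B, Value *X) {
    Function *Acos =
        Intrinsic::getDeclaration(&M, Intrinsic::acos, {B.getDoubleTy()});
    return B.CreateCall(Acos, {X});
  }
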
Modified: llvm/trunk/lib/Analysis/TargetLibraryInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetLibraryInfo.cpp?rev=347510&r1=347509&r2=347510&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetLibraryInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetLibraryInfo.cpp Fri Nov 23 22:41:39 2018
@@ -25,7 +25,9 @@ static cl::opt<TargetLibraryInfoImpl::Ve
                clEnumValN(TargetLibraryInfoImpl::Accelerate, "Accelerate",
                           "Accelerate framework"),
                clEnumValN(TargetLibraryInfoImpl::SVML, "SVML",
-                          "Intel SVML library")));
+                          "Intel SVML library"),
+               clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi",
+                          "SIMD Library for Evaluating Elementary Functions")));
 
 StringRef const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = {
 #define TLI_DEFINE_STRING
@@ -524,7 +526,8 @@ TargetLibraryInfoImpl::TargetLibraryInfo
   initialize(*this, Triple(), StandardNames);
 }
 
-TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) {
+TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T)
+  : TT(T) {
   // Default to everything being available.
   memset(AvailableArray, -1, sizeof(AvailableArray));
 
@@ -532,7 +535,8 @@ TargetLibraryInfoImpl::TargetLibraryInfo
 }
 
 TargetLibraryInfoImpl::TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI)
-    : CustomNames(TLI.CustomNames), ShouldExtI32Param(TLI.ShouldExtI32Param),
+    : TT(TLI.TT), CustomNames(TLI.CustomNames),
+      ShouldExtI32Param(TLI.ShouldExtI32Param),
       ShouldExtI32Return(TLI.ShouldExtI32Return),
       ShouldSignExtI32Param(TLI.ShouldSignExtI32Param) {
   memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
@@ -541,7 +545,7 @@ TargetLibraryInfoImpl::TargetLibraryInfo
 }
 
 TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI)
-    : CustomNames(std::move(TLI.CustomNames)),
+    : TT(std::move(TLI.TT)), CustomNames(std::move(TLI.CustomNames)),
       ShouldExtI32Param(TLI.ShouldExtI32Param),
       ShouldExtI32Return(TLI.ShouldExtI32Return),
       ShouldSignExtI32Param(TLI.ShouldSignExtI32Param) {
@@ -552,6 +556,7 @@ TargetLibraryInfoImpl::TargetLibraryInfo
 }
 
 TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoImpl &TLI) {
+  TT = TLI.TT;
   CustomNames = TLI.CustomNames;
   ShouldExtI32Param = TLI.ShouldExtI32Param;
   ShouldExtI32Return = TLI.ShouldExtI32Return;
@@ -561,6 +566,7 @@ TargetLibraryInfoImpl &TargetLibraryInfo
 }
 
 TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl &&TLI) {
+  TT = std::move(TLI.TT);
   CustomNames = std::move(TLI.CustomNames);
   ShouldExtI32Param = TLI.ShouldExtI32Param;
   ShouldExtI32Return = TLI.ShouldExtI32Return;
@@ -1579,6 +1585,150 @@ void TargetLibraryInfoImpl::addVectoriza
     addVectorizableFunctions(VecFuncs);
     break;
   }
+  case SLEEFGNUABI: {
+    if (TT.getArch() == llvm::Triple::aarch64 ||
+        TT.getArch() == llvm::Triple::aarch64_be) {
+      const VecDesc AArch64TwoAndFourLaneVecFuncs[] = {
+        { "acos", "_ZGVnN2v_acos", 2 },
+        { "acos", "_ZGVnN4v_acosf", 4 },
+        { "acosf", "_ZGVnN4v_acosf", 4 },
+        { "llvm.acos.f64", "_ZGVnN2v_acos", 2 },
+        { "llvm.acos.f32", "_ZGVnN4v_acosf", 4 },
+
+        { "asin", "_ZGVnN2v_asin", 2 },
+        { "asin", "_ZGVnN4v_asinf", 4 },
+        { "asinf", "_ZGVnN4v_asinf", 4 },
+        { "llvm.asin.f64", "_ZGVnN2v_asin", 2 },
+        { "llvm.asin.f32", "_ZGVnN4v_asinf", 4 },
+
+        { "atan", "_ZGVnN2v_atan", 2 },
+        { "atan", "_ZGVnN4v_atanf", 4 },
+        { "atanf", "_ZGVnN4v_atanf", 4 },
+        { "llvm.atan.f64", "_ZGVnN2v_atan", 2 },
+        { "llvm.atan.f32", "_ZGVnN4v_atanf", 4 },
+
+        { "atan2", "_ZGVnN2vv_atan2", 2 },
+        { "atan2", "_ZGVnN4vv_atan2f", 4 },
+        { "atan2f", "_ZGVnN4vv_atan2f", 4 },
+        { "llvm.atan2.f64", "_ZGVnN2vv_atan2", 2 },
+        { "llvm.atan2.f32", "_ZGVnN4vv_atan2f", 4 },
+        { "llvm.atan2.v2f64", "_ZGVnN2vv_atan2", 2 },
+        { "llvm.atan2.v4f32", "_ZGVnN4vv_atan2f", 4 },
+
+        { "atanh", "_ZGVnN2v_atanh", 2 },
+        { "atanh", "_ZGVnN4v_atanhf", 4 },
+        { "atanhf", "_ZGVnN4v_atanhf", 4 },
+        { "llvm.atanh.f64", "_ZGVnN2v_atanh", 2 },
+        { "llvm.atanh.f32", "_ZGVnN4v_atanhf", 4 },
+
+        { "cos", "_ZGVnN2v_cos", 2 },
+        { "cos", "_ZGVnN4v_cosf", 4 },
+        { "cosf", "_ZGVnN4v_cosf", 4 },
+        { "llvm.cos.f64", "_ZGVnN2v_cos", 2 },
+        { "llvm.cos.f32", "_ZGVnN4v_cosf", 4 },
+
+        { "cosh", "_ZGVnN2v_cosh", 2 },
+        { "cosh", "_ZGVnN4v_coshf", 4 },
+        { "coshf", "_ZGVnN4v_coshf", 4 },
+        { "llvm.cosh.f64", "_ZGVnN2v_cosh", 2 },
+        { "llvm.cosh.f32", "_ZGVnN4v_coshf", 4 },
+
+        { "exp", "_ZGVnN2v_exp", 2 },
+        { "exp", "_ZGVnN4v_expf", 4 },
+        { "expf", "_ZGVnN4v_expf", 4 },
+        { "llvm.exp.f64", "_ZGVnN2v_exp", 2 },
+        { "llvm.exp.f32", "_ZGVnN4v_expf", 4 },
+        { "llvm.exp.v2f64", "_ZGVnN2v_exp", 2 },
+        { "llvm.exp.v4f32", "_ZGVnN4v_expf", 4 },
+
+        { "exp2", "_ZGVnN2v_exp2", 2 },
+        { "exp2", "_ZGVnN4v_exp2f", 4 },
+        { "exp2f", "_ZGVnN4v_exp2f", 4 },
+        { "llvm.exp2.f64", "_ZGVnN2v_exp2", 2 },
+        { "llvm.exp2.f32", "_ZGVnN4v_exp2f", 4 },
+        { "llvm.exp2.v2f64", "_ZGVnN2v_exp2", 2 },
+        { "llvm.exp2.v4f32", "_ZGVnN4v_exp2f", 4 },
+
+        { "exp10", "_ZGVnN2v_exp10", 2 },
+        { "exp10", "_ZGVnN4v_exp10f", 4 },
+        { "exp10f", "_ZGVnN4v_exp10f", 4 },
+        { "llvm.exp10.f64", "_ZGVnN2v_exp10", 2 },
+        { "llvm.exp10.f32", "_ZGVnN4v_exp10f", 4 },
+        { "llvm.exp10.v2f64", "_ZGVnN2v_exp10", 2 },
+        { "llvm.exp10.v4f32", "_ZGVnN4v_exp10f", 4 },
+
+        { "lgamma", "_ZGVnN2v_lgamma", 2 },
+        { "lgamma", "_ZGVnN4v_lgammaf", 4 },
+        { "lgammaf", "_ZGVnN4v_lgammaf", 4 },
+        { "llvm.lgamma.f64", "_ZGVnN2v_lgamma", 2 },
+        { "llvm.lgamma.f32", "_ZGVnN4v_lgammaf", 4 },
+
+        { "log", "_ZGVnN2v_log", 2 },
+        { "log", "_ZGVnN4v_logf", 4 },
+        { "logf", "_ZGVnN4v_logf", 4 },
+        { "llvm.log.f64", "_ZGVnN2v_log", 2 },
+        { "llvm.log.f32", "_ZGVnN4v_logf", 4 },
+
+        { "log2", "_ZGVnN2v_log2", 2 },
+        { "log2", "_ZGVnN4v_log2f", 4 },
+        { "log2f", "_ZGVnN4v_log2f", 4 },
+        { "llvm.log2.f64", "_ZGVnN2v_log2", 2 },
+        { "llvm.log2.f32", "_ZGVnN4v_log2f", 4 },
+
+        { "log10", "_ZGVnN2v_log10", 2 },
+        { "log10", "_ZGVnN4v_log10f", 4 },
+        { "log10f", "_ZGVnN4v_log10f", 4 },
+        { "llvm.log10.f64", "_ZGVnN2v_log10", 2 },
+        { "llvm.log10.f32", "_ZGVnN4v_log10f", 4 },
+
+        { "pow", "_ZGVnN2vv_pow", 2 },
+        { "pow", "_ZGVnN4vv_powf", 4 },
+        { "powf", "_ZGVnN4vv_powf", 4 },
+        { "llvm.pow.f64", "_ZGVnN2vv_pow", 2 },
+        { "llvm.pow.f32", "_ZGVnN4vv_powf", 4 },
+        { "llvm.pow.v2f64", "_ZGVnN2vv_pow", 2 },
+        { "llvm.pow.v4f32", "_ZGVnN4vv_powf", 4 },
+
+        { "sin", "_ZGVnN2v_sin", 2 },
+        { "sin", "_ZGVnN4v_sinf", 4 },
+        { "sinf", "_ZGVnN4v_sinf", 4 },
+        { "llvm.sin.f64", "_ZGVnN2v_sin", 2 },
+        { "llvm.sin.f32", "_ZGVnN4v_sinf", 4 },
+
+        { "sinh", "_ZGVnN2v_sinh", 2 },
+        { "sinh", "_ZGVnN4v_sinhf", 4 },
+        { "sinhf", "_ZGVnN4v_sinhf", 4 },
+        { "llvm.sinh.f64", "_ZGVnN2v_sinh", 2 },
+        { "llvm.sinh.f32", "_ZGVnN4v_sinhf", 4 },
+
+        { "sqrt", "_ZGVnN2v_sqrt", 2 },
+        { "sqrt", "_ZGVnN4v_sqrtf", 4 },
+        { "sqrtf", "_ZGVnN4v_sqrtf", 4 },
+        { "llvm.sqrt.f64", "_ZGVnN2v_sqrt", 2 },
+        { "llvm.sqrt.f32", "_ZGVnN4v_sqrtf", 4 },
+
+        { "tan", "_ZGVnN2v_tan", 2 },
+        { "tan", "_ZGVnN4v_tanf", 4 },
+        { "tanf", "_ZGVnN4v_tanf", 4 },
+        { "llvm.tan.f64", "_ZGVnN2v_tan", 2 },
+        { "llvm.tan.f32", "_ZGVnN4v_tanf", 4 },
+
+        { "tanh", "_ZGVnN2v_tanh", 2 },
+        { "tanh", "_ZGVnN4v_tanhf", 4 },
+        { "tanhf", "_ZGVnN4v_tanhf", 4 },
+        { "llvm.tanh.f64", "_ZGVnN2v_tanh", 2 },
+        { "llvm.tanh.f32", "_ZGVnN4v_tanhf", 4 },
+
+        { "tgamma", "_ZGVnN2v_tgamma", 2 },
+        { "tgamma", "_ZGVnN4v_tgammaf", 4 },
+        { "tgammaf", "_ZGVnN4v_tgammaf", 4 },
+        { "llvm.tgamma.f64", "_ZGVnN2v_tgamma", 2 },
+        { "llvm.tgamma.f32", "_ZGVnN4v_tgammaf", 4 },
+      };
+      addVectorizableFunctions(AArch64TwoAndFourLaneVecFuncs);
+    }
+    break;
+  }
   case NoLibrary:
     break;
   }

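The vector names in the table follow the AArch64 vector function ABI
mangling: _ZGV, then 'n' for AdvSIMD, 'N' for an unmasked variant, the lane
count, one 'v' per vector parameter, an underscore, and the scalar name. A
hypothetical helper (not in the patch) that reproduces the scheme:

  #include <string>

  // sleefVariant("sinf", 4, 1)  == "_ZGVnN4v_sinf"
  // sleefVariant("atan2", 2, 2) == "_ZGVnN2vv_atan2"
  static std::string sleefVariant(const std::string &Scalar, unsigned Lanes,
                                  unsigned NumVectorParams) {
    std::string Name = "_ZGVnN" + std::to_string(Lanes);
    Name.append(NumVectorParams, 'v'); // one 'v' per vector argument
    return Name + "_" + Scalar;
  }
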
Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll?rev=347510&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll Fri Nov 23 22:41:39 2018
@@ -0,0 +1,1077 @@
+; Do NOT use -O3. It will lower exp2 to ldexp, and the test will fail.
+; RUN: opt -vector-library=sleefgnuabi -loop-unroll -loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare double @acos(double) #0
+declare float @acosf(float) #0
+declare double @llvm.acos.f64(double) #0
+declare float @llvm.acos.f32(float) #0
+
+define void @acos_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @acos_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @acos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @acos_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @acos_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @acosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @asin(double) #0
+declare float @asinf(float) #0
+declare double @llvm.asin.f64(double) #0
+declare float @llvm.asin.f32(float) #0
+
+define void @asin_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @asin_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @asin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @asin_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @asin_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @asinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atan(double) #0
+declare float @atanf(float) #0
+declare double @llvm.atan.f64(double) #0
+declare float @llvm.atan.f32(float) #0
+
+define void @atan_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atan_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atan(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atan_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atan_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atanf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atan2(double, double) #0
+declare float @atan2f(float, float) #0
+declare double @llvm.atan2.f64(double, double) #0
+declare float @llvm.atan2.f32(float, float) #0
+
+define void @atan2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atan2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atan2(double %conv, double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atan2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atan2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atan2f(float %conv, float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @atanh(double) #0
+declare float @atanhf(float) #0
+declare double @llvm.atanh.f64(double) #0
+declare float @llvm.atanh.f32(float) #0
+
+define void @atanh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @atanh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @atanh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @atanh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @atanh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @atanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+define void @cos_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @cos_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @cos_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @cosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @cosh(double) #0
+declare float @coshf(float) #0
+declare double @llvm.cosh.f64(double) #0
+declare float @llvm.cosh.f32(float) #0
+
+define void @cosh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @cosh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cosh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @cosh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @cosh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @coshf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+define void @exp_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @expf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp2(double) #0
+declare float @exp2f(float) #0
+declare double @llvm.exp2.f64(double) #0
+declare float @llvm.exp2.f32(float) #0
+
+define void @exp2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp2(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @exp2f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @exp10(double) #0
+declare float @exp10f(float) #0
+declare double @llvm.exp10.f64(double) #0
+declare float @llvm.exp10.f32(float) #0
+
+define void @exp10_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @exp10_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp10(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @exp10_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @exp10_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @exp10f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @lgamma(double) #0
+declare float @lgammaf(float) #0
+declare double @llvm.lgamma.f64(double) #0
+declare float @llvm.lgamma.f32(float) #0
+
+define void @lgamma_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @lgamma_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @lgamma(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @lgamma_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @lgamma_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @lgammaf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log10(double) #0
+declare float @log10f(float) #0
+declare double @llvm.log10.f64(double) #0
+declare float @llvm.log10.f32(float) #0
+
+define void @log10_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log10_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log10(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log10_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log10_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @log10f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log2(double) #0
+declare float @log2f(float) #0
+declare double @llvm.log2.f64(double) #0
+declare float @llvm.log2.f32(float) #0
+
+define void @log2_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log2_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log2(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log2_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log2_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @log2f(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+define void @log_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @log_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @log_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @logf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+define void @pow_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @pow_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @pow(double %conv, double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @pow_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @pow_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @powf(float %conv, float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+define void @sin_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sin_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sin_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sinh(double) #0
+declare float @sinhf(float) #0
+declare double @llvm.sinh.f64(double) #0
+declare float @llvm.sinh.f32(float) #0
+
+define void @sinh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sinh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sinh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sinh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sinh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @sqrt(double) #0
+declare float @sqrtf(float) #0
+declare double @llvm.sqrt.f64(double) #0
+declare float @llvm.sqrt.f32(float) #0
+
+define void @sqrt_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @sqrt_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sqrt(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @sqrt_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @sqrt_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sqrtf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tan(double) #0
+declare float @tanf(float) #0
+declare double @llvm.tan.f64(double) #0
+declare float @llvm.tan.f32(float) #0
+
+define void @tan_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tan_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tan(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tan_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tan_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tanh(double) #0
+declare float @tanhf(float) #0
+declare double @llvm.tanh.f64(double) #0
+declare float @llvm.tanh.f32(float) #0
+
+define void @tanh_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tanh_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tanh(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tanh_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tanh_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+declare double @tgamma(double) #0
+declare float @tgammaf(float) #0
+declare double @llvm.tgamma.f64(double) #0
+declare float @llvm.tgamma.f32(float) #0
+
+define void @tgamma_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @tgamma_f64(
+  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @tgamma(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @tgamma_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @tgamma_f32(
+  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; CHECK:    ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tgammaf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
