[llvm] 93a9a8a - [VecLib] Add support for vector fns from Darwin's libsystem.

Mon May 10 13:21:00 PDT 2021

Author: Florian Hahn
Date: 2021-05-10T21:19:58+01:00
New Revision: 93a9a8a8d90f5b9bb6965ebb1104082692d41833

URL: https://github.com/llvm/llvm-project/commit/93a9a8a8d90f5b9bb6965ebb1104082692d41833
DIFF: https://github.com/llvm/llvm-project/commit/93a9a8a8d90f5b9bb6965ebb1104082692d41833.diff

LOG: [VecLib] Add support for vector fns from Darwin's libsystem.

This patch adds support for the vector math functions in Darwin's
libsystem to TLI (TargetLibraryInfo). Darwin's libsystem provides vector
variants of a range of libm functions.

This initial patch only adds the 2 x double and 4 x float versions,
which are available on both X86 and ARM64. On X86, wider vector versions
are supported as well, but this patch does not add them yet.

Reviewed By: jroelofs

Differential Revision: https://reviews.llvm.org/D101856

Added: 
    llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib-darwin-libsystem-m.ll
    llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll

Modified: 
    llvm/include/llvm/Analysis/TargetLibraryInfo.h
    llvm/include/llvm/Analysis/VecFuncs.def
    llvm/lib/Analysis/TargetLibraryInfo.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index adc6363c5b71d..a7ad218356afb 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -86,11 +86,12 @@ class TargetLibraryInfoImpl {
   /// addVectorizableFunctionsFromVecLib for filling up the tables of
   /// vectorizable functions.
   enum VectorLibrary {
-    NoLibrary,  // Don't use any vector library.
-    Accelerate, // Use Accelerate framework.
-    LIBMVEC_X86,// GLIBC Vector Math library.
-    MASSV,      // IBM MASS vector library.
-    SVML        // Intel short vector math library.
+    NoLibrary,        // Don't use any vector library.
+    Accelerate,       // Use Accelerate framework.
+    DarwinLibSystemM, // Use Darwin's libsystem_m.
+    LIBMVEC_X86,      // GLIBC Vector Math library.
+    MASSV,            // IBM MASS vector library.
+    SVML              // Intel short vector math library.
   };
 
   TargetLibraryInfoImpl();

diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 3391afd7d3baf..2539ff0c91068 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -64,6 +64,59 @@ TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4))
 TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4))
 TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4))
 
+#elif defined(TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS)
+// Darwin libsystem_m vector functions.
+
+// Exponential and Logarithmic Functions
+TLI_DEFINE_VECFUNC("exp", "_simd_exp_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_simd_exp_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("expf", "_simd_exp_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_simd_exp_f4", FIXED(4))
+
+// Trigonometric Functions
+TLI_DEFINE_VECFUNC("acos", "_simd_acos_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("acosf", "_simd_acos_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("asin", "_simd_asin_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("asinf", "_simd_asin_f4", FIXED(4))
+
+TLI_DEFINE_VECFUNC("atan", "_simd_atan_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("atanf", "_simd_atan_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("atan2", "_simd_atan2_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("atan2f", "_simd_atan2_f4", FIXED(4))
+
+TLI_DEFINE_VECFUNC("cos", "_simd_cos_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_simd_cos_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("cosf", "_simd_cos_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_simd_cos_f4", FIXED(4))
+
+TLI_DEFINE_VECFUNC("sin", "_simd_sin_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_simd_sin_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("sinf", "_simd_sin_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_simd_sin_f4", FIXED(4))
+
+// Floating-Point Arithmetic and Auxiliary Functions
+TLI_DEFINE_VECFUNC("cbrt", "_simd_cbrt_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("cbrtf", "_simd_cbrt_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("erf", "_simd_erf_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("erff", "_simd_erf_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("pow", "_simd_pow_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_simd_pow_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("powf", "_simd_pow_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_simd_pow_f4", FIXED(4))
+
+// Hyperbolic Functions
+TLI_DEFINE_VECFUNC("sinh", "_simd_sinh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("sinhf", "_simd_sinh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("cosh", "_simd_cosh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("coshf", "_simd_cosh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("tanh", "_simd_tanh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("tanhf", "_simd_tanh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("asinh", "_simd_asinh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("asinhf", "_simd_asinh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("acosh", "_simd_acosh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("acoshf", "_simd_acosh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("atanh", "_simd_atanh_d2", FIXED(2))
+TLI_DEFINE_VECFUNC("atanhf", "_simd_atanh_f4", FIXED(4))
 
 #elif defined(TLI_DEFINE_LIBMVEC_X86_VECFUNCS)
 // GLIBC Vector math Functions
@@ -419,6 +472,7 @@ TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16))
 
 #undef TLI_DEFINE_VECFUNC
 #undef TLI_DEFINE_ACCELERATE_VECFUNCS
+#undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
 #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS
 #undef TLI_DEFINE_SVML_VECFUNCS

diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 9ea84f1dcb2f3..153ba073cc10b 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -24,6 +24,8 @@ static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary(
                           "No vector functions library"),
                clEnumValN(TargetLibraryInfoImpl::Accelerate, "Accelerate",
                           "Accelerate framework"),
+               clEnumValN(TargetLibraryInfoImpl::DarwinLibSystemM,
+                          "Darwin_libsystem_m", "Darwin libsystem_m"),
                clEnumValN(TargetLibraryInfoImpl::LIBMVEC_X86, "LIBMVEC-X86",
                           "GLIBC Vector Math library"),
                clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV",
@@ -1622,6 +1624,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     addVectorizableFunctions(VecFuncs);
     break;
   }
+  case DarwinLibSystemM: {
+    const VecDesc VecFuncs[] = {
+    #define TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
+    #include "llvm/Analysis/VecFuncs.def"
+    };
+    addVectorizableFunctions(VecFuncs);
+    break;
+  }
   case LIBMVEC_X86: {
     const VecDesc VecFuncs[] = {
     #define TLI_DEFINE_LIBMVEC_X86_VECFUNCS

diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib-darwin-libsystem-m.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib-darwin-libsystem-m.ll
new file mode 100644
index 0000000000000..7602e1900e436
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib-darwin-libsystem-m.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes
+; RUN: opt -vector-library=Darwin_libsystem_m -replace-with-veclib -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <4 x float> @sin_v4f32(<4 x float> %in) {
+; CHECK-LABEL: define {{[^@]+}}@sin_v4f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @_simd_sin_f4(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %call = call <4 x float> @llvm.sin.v4f32(<4 x float> %in)
+  ret <4 x float> %call
+}
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) #0
+
+define <2 x double> @sin_v2f64(<2 x double> %in) {
+; CHECK-LABEL: define {{[^@]+}}@sin_v2f64
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @_simd_sin_d2(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %call = call <2 x double> @llvm.sin.v2f64(<2 x double> %in)
+  ret <2 x double> %call
+}
+declare <2 x double> @llvm.sin.v2f64(<2 x double>)
+
+
+define <2 x double> @exp_v2(<2 x double> %in) {
+; CHECK-LABEL: define {{[^@]+}}@exp_v2
+; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @_simd_exp_d2(<2 x double> [[IN]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %call = call <2 x double> @llvm.exp.v2f64(<2 x double> %in)
+  ret <2 x double> %call
+}
+
+declare <2 x double> @llvm.exp.v2f64(<2 x double>) #0
+
+define <4 x float> @exp_f32(<4 x float> %in) {
+; CHECK-LABEL: define {{[^@]+}}@exp_f32
+; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @_simd_exp_f4(<4 x float> [[IN]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %call = call <4 x float> @llvm.exp.v4f32(<4 x float> %in)
+  ret <4 x float> %call
+}
+
+declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0
+
+define <3 x double> @exp_v3(<3 x double> %in) {
+; CHECK-LABEL: define {{[^@]+}}@exp_v3
+; CHECK-SAME: (<3 x double> [[IN:%.*]]) {
+; CHECK-NEXT:    [[CALL:%.*]] = call <3 x double> @llvm.exp.v3f64(<3 x double> [[IN]])
+; CHECK-NEXT:    ret <3 x double> [[CALL]]
+;
+  %call = call <3 x double> @llvm.exp.v3f64(<3 x double> %in)
+  ret <3 x double> %call
+}
+
+declare <3 x double> @llvm.exp.v3f64(<3 x double>) #0

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
new file mode 100644
index 0000000000000..14367c462030e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
@@ -0,0 +1,724 @@
+; RUN: opt < %s -vector-library=Darwin_libsystem_m -inject-tli-mappings -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare float @expf(float) nounwind readnone
+define void @expf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @expf_v4f32(
+; CHECK: call <4 x float> @_simd_exp_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @expf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @exp(double) nounwind readnone
+define void @exp_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @exp_v2f64(
+; CHECK: call <2 x double> @_simd_exp_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @exp(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @acosf(float) nounwind readnone
+define void @acos_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @acos_v4f32(
+; CHECK: call <4 x float> @_simd_acos_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @acosf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @acos(double) nounwind readnone
+define void @acos_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @acos_v2f64(
+; CHECK: call <2 x double> @_simd_acos_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @acos(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @asinf(float) nounwind readnone
+define void @asinf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @asinf_v4f32(
+; CHECK: call <4 x float> @_simd_asin_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @asinf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @asin(double) nounwind readnone
+define void @asin_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @asin_v2f64(
+; CHECK: call <2 x double> @_simd_asin_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @asin(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @atanf(float) nounwind readnone
+define void @atanf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @atanf_v4f32(
+; CHECK: call <4 x float> @_simd_atan_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @atanf(float %lv) ; scalar call the CHECK lines expect to become @_simd_atan_f4
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @atan(double) nounwind readnone
+define void @atan_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @atan_v2f64(
+; CHECK: call <2 x double> @_simd_atan_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @atan(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @atan2f(float, float) nounwind readnone ; libm atan2f takes two operands
+define void @atan2f_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @atan2f_v4f32(
+; CHECK: call <4 x float> @_simd_atan2_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @atan2f(float %lv, float %lv) ; same value for both operands; the CHECK lines only look for the vector call
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @atan2(double, double) nounwind readnone ; libm atan2 takes two operands
+define void @atan2_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @atan2_v2f64(
+; CHECK: call <2 x double> @_simd_atan2_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @atan2(double %lv, double %lv) ; same value for both operands; the CHECK lines only look for the vector call
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @cosf(float) nounwind readnone
+define void @cosf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @cosf_v4f32(
+; CHECK: call <4 x float> @_simd_cos_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @cosf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @cos(double) nounwind readnone
+define void @cos_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @cos_v2f64(
+; CHECK: call <2 x double> @_simd_cos_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @cos(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @cbrtf(float) nounwind readnone
+define void @cbrtf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @cbrtf_v4f32(
+; CHECK: call <4 x float> @_simd_cbrt_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @cbrtf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @cbrt(double) nounwind readnone
+define void @cbrt_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @cbrt_v2f64(
+; CHECK: call <2 x double> @_simd_cbrt_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @cbrt(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @erff(float) nounwind readnone
+define void @erff_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @erff_v4f32(
+; CHECK: call <4 x float> @_simd_erf_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @erff(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @erf(double) nounwind readnone
+define void @erf_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @erf_v2f64(
+; CHECK: call <2 x double> @_simd_erf_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @erf(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @powf(float, float) nounwind readnone ; libm powf takes two operands (base, exponent)
+define void @powf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @powf_v4f32(
+; CHECK: call <4 x float> @_simd_pow_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @powf(float %lv, float %lv) ; same value for both operands; the CHECK lines only look for the vector call
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @pow(double, double) nounwind readnone ; libm pow takes two operands (base, exponent)
+define void @pow_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @pow_v2f64(
+; CHECK: call <2 x double> @_simd_pow_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @pow(double %lv, double %lv) ; same value for both operands; the CHECK lines only look for the vector call
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @sinhf(float) nounwind readnone
+define void @sinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @sinhf_v4f32(
+; CHECK: call <4 x float> @_simd_sinh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @sinhf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @sinh(double) nounwind readnone
+define void @sinh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @sinh_v2f64(
+; CHECK: call <2 x double> @_simd_sinh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @sinh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @coshf(float) nounwind readnone
+define void @coshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @coshf_v4f32(
+; CHECK: call <4 x float> @_simd_cosh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @coshf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @cosh(double) nounwind readnone
+define void @cosh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @cosh_v2f64(
+; CHECK: call <2 x double> @_simd_cosh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @cosh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @tanhf(float) nounwind readnone
+define void @tanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @tanhf_v4f32(
+; CHECK: call <4 x float> @_simd_tanh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @tanhf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @tanh(double) nounwind readnone
+define void @tanh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @tanh_v2f64(
+; CHECK: call <2 x double> @_simd_tanh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @tanh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @asinhf(float) nounwind readnone
+define void @asinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @asinhf_v4f32(
+; CHECK: call <4 x float> @_simd_asinh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @asinhf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @asinh(double) nounwind readnone
+define void @asinh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @asinh_v2f64(
+; CHECK: call <2 x double> @_simd_asinh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @asinh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @acoshf(float) nounwind readnone
+define void @acoshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @acoshf_v4f32(
+; CHECK: call <4 x float> @_simd_acosh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @acoshf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @acosh(double) nounwind readnone
+define void @acosh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @acosh_v2f64(
+; CHECK: call <2 x double> @_simd_acosh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @acosh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @atanhf(float) nounwind readnone
+define void @atanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
+; CHECK-LABEL: @atanhf_v4f32(
+; CHECK: call <4 x float> @_simd_atanh_f4(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds float, float* %y, i64 %iv
+  %lv = load float, float* %gep.y, align 4
+  %call = tail call float @atanhf(float %lv)
+  %gep.x = getelementptr inbounds float, float* %x, i64 %iv
+  store float %call, float* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare double @atanh(double) nounwind readnone
+define void @atanh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
+; CHECK-LABEL: @atanh_v2f64(
+; CHECK: call <2 x double> @_simd_atanh_d2(
+; CHECK: ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %gep.y = getelementptr inbounds double, double* %y, i64 %iv
+  %lv = load double, double* %gep.y, align 4
+  %call = tail call double @atanh(double %lv)
+  %gep.x = getelementptr inbounds double, double* %x, i64 %iv
+  store double %call, double* %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}