[llvm] Adding more vector calls for -fveclib=AMDLIBM (PR #109662)
Rohit Aggarwal via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 28 21:55:34 PDT 2024
https://github.com/rohitaggarwal007 updated https://github.com/llvm/llvm-project/pull/109662
From 4e9e39be2e88f8ef706feda5a8be73bb0aa12602 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Mon, 23 Sep 2024 18:28:00 +0530
Subject: [PATCH 1/2] Add more vector calls for -fveclib=AMDLIBM
Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
---
llvm/include/llvm/Analysis/VecFuncs.def | 24 ++
.../LoopVectorize/X86/amdlibm-calls-finite.ll | 275 ++++++++++++------
.../LoopVectorize/X86/amdlibm-calls.ll | 212 ++++++++++++++
llvm/test/Transforms/Util/add-TLI-mappings.ll | 27 +-
4 files changed, 443 insertions(+), 95 deletions(-)
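For context, the VecFuncs.def entries below feed TargetLibraryInfo, which LoopVectorize consults when deciding whether a scalar math call has a vector counterpart. A minimal sketch of the intended end-to-end effect (illustrative IR, not part of the patch; assumes -vector-library=AMDLIBM, a loop vectorized at VF 2, and invented value names):

  ; scalar loop body, before vectorization:
  %call = tail call double @log10(double %conv)
  ; vector loop body, after LoopVectorize applies the new "log10" mapping:
  %wide.call = call <2 x double> @amd_vrd2_log10(<2 x double> %wide.conv)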
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 532a3ca334b1ae..06024131c5bdac 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1317,14 +1317,17 @@ TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLV
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log10", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log10_finite", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1339,6 +1342,12 @@ TLI_DEFINE_VECFUNC("erf", "amd_vrd8_erf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("exp10", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("exp10f", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp10_finite", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp10f_finite", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
TLI_DEFINE_VECFUNC("expm1", "amd_vrd2_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("expm1f", "amd_vrs4_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1369,10 +1378,19 @@ TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LL
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__asin_finite", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1410,6 +1428,12 @@ TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_
TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd4_sincos", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl8l8")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd8_sincos", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl8l8")
+
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs4_sincosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs8_sincosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs16_sincosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vl4l4")
#else
#error "Must choose which vector library functions are to be defined."
#endif
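A note on the sincos mangled names, since these are the first AMDLIBM entries with pointer parameters. Decoding the VFABI string (my reading, not patch text):

  ; _ZGV_LLVM_N4vl8l8 = LLVM vector ABI, N = unmasked, 4 = VF,
  ;   v  = vector parameter (the double input),
  ;   l8 = linear pointer parameter stepping 8 bytes per lane,
  ;        i.e. lane i stores its result at base + 8*i
  ; hence the variant's signature:
  declare void @amd_vrd4_sincos(<4 x double>, ptr, ptr)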
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
index 54bb9352f3c89c..9899eded738086 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
@@ -7,12 +7,10 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare float @__expf_finite(float) #0
-
+define void @exp_f32(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f32
; CHECK: <4 x float> @amd_vrs4_expf
; CHECK: ret
-define void @exp_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -25,23 +23,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__exp_finite(double) #0
-
+define void @exp_f64(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f64
; CHECK: <4 x double> @amd_vrd4_exp
; CHECK: ret
-define void @exp_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -54,25 +45,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!11 = distinct !{!11, !12, !13}
-!12 = !{!"llvm.loop.vectorize.width", i32 4}
-!13 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-
-
-declare float @__logf_finite(float) #0
-
+define void @log_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log_f32
; CHECK: <4 x float> @amd_vrs4_logf
; CHECK: ret
-define void @log_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -85,23 +67,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!21 = distinct !{!21, !22, !23}
-!22 = !{!"llvm.loop.vectorize.width", i32 4}
-!23 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log_finite(double) #0
-
+define void @log_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log_f64
; CHECK: <4 x double> @amd_vrd4_log
; CHECK: ret
-define void @log_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -114,23 +89,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!31 = distinct !{!31, !32, !33}
-!32 = !{!"llvm.loop.vectorize.width", i32 4}
-!33 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare float @__powf_finite(float, float) #0
-
+define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32
; CHECK: <4 x float> @amd_vrs4_powf
; CHECK: ret
-define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -145,23 +113,16 @@ for.body: ; preds = %for.body, %entry
store float %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!41 = distinct !{!41, !42, !43}
-!42 = !{!"llvm.loop.vectorize.width", i32 4}
-!43 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__pow_finite(double, double) #0
-
+define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64
; CHECK: <4 x double> @amd_vrd4_pow
; CHECK: ret
-define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -176,18 +137,12 @@ for.body: ; preds = %for.body, %entry
store double %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!51 = distinct !{!51, !52, !53}
-!52 = !{!"llvm.loop.vectorize.width", i32 4}
-!53 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__exp2f_finite(float) #0
-
define void @exp2f_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2f_finite(
; CHECK: call <4 x float> @amd_vrs4_exp2f(<4 x float> %{{.*}})
@@ -205,18 +160,12 @@ for.body:
store float %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !61
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!61 = distinct !{!61, !62, !63}
-!62 = !{!"llvm.loop.vectorize.width", i32 4}
-!63 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare double @__exp2_finite(double) #0
-
define void @exp2_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2_finite(
; CHECK: call <4 x double> @amd_vrd4_exp2(<4 x double> {{.*}})
@@ -234,22 +183,16 @@ for.body:
store double %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !71
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!71 = distinct !{!71, !72, !73}
-!72 = !{!"llvm.loop.vectorize.width", i32 4}
-!73 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log2f_finite(float) #0
-
+define void @log2_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f32
; CHECK: <4 x float> @amd_vrs4_log2f
; CHECK: ret
-define void @log2_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -262,23 +205,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!81 = distinct !{!21, !22, !23}
-!82 = !{!"llvm.loop.vectorize.width", i32 4}
-!83 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log2_finite(double) #0
-
+define void @log2_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f64
; CHECK: <4 x double> @amd_vrd4_log2
; CHECK: ret
-define void @log2_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -291,22 +227,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!91 = distinct !{!31, !32, !33}
-!92 = !{!"llvm.loop.vectorize.width", i32 4}
-!93 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log10f_finite(float) #0
-
+define void @log10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32
; CHECK: <4 x float> @amd_vrs4_log10f
; CHECK: ret
-define void @log10_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -319,14 +249,173 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @log10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_finite(
+; CHECK: call <2 x double> @amd_vrd2_log10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__log10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_finite(
+; CHECK: call <2 x double> @amd_vrd2_exp10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__exp10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32
+; CHECK: <4 x float> @amd_vrs4_exp10f
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__exp10f_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!101 = distinct !{!21, !22, !23}
-!102 = !{!"llvm.loop.vectorize.width", i32 4}
-!103 = !{!"llvm.loop.vectorize.enable", i1 true}
+define void @asin_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_finite(
+; CHECK: call <8 x double> @amd_vrd8_asin(<8 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__asin_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end:
+ ret void
+}
+define void @asinf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asinf_finite
+; CHECK: <4 x float> @amd_vrs4_asinf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__asinf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @acosf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @acosf_finite
+; CHECK: <4 x float> @amd_vrs4_acosf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__acosf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!4 = distinct !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!7 = distinct !{!7, !8, !9}
+!8 = !{!"llvm.loop.vectorize.width", i32 8}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+declare float @__expf_finite(float) #0
+declare double @__exp_finite(double) #0
+declare double @__log_finite(double) #0
+declare float @__logf_finite(float) #0
+declare float @__powf_finite(float, float) #0
+declare double @__pow_finite(double, double) #0
+declare float @__exp2f_finite(float) #0
+declare double @__exp2_finite(double) #0
+declare float @__log2f_finite(float) #0
+declare double @__log2_finite(double) #0
+declare float @__log10f_finite(float) #0
+declare double @__log10_finite(double) #0
+declare double @__exp10_finite(double) #0
+declare float @__exp10f_finite(float) #0
+declare double @__asin_finite(double) #0
+declare float @__asinf_finite(float) #0
+declare float @__acosf_finite(float) #0
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 4acc7fe7eaccf6..f6c91de15279e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1444,6 +1444,59 @@ for.end:
ret void
}
+define void @log10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.log10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @log10(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log10_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64_intrinsic(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.log10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.log10.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
define void @log10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log10.v2f32(<2 x float> [[TMP4:%.*]])
@@ -1600,4 +1653,163 @@ for.end:
ret void
}
+define void @exp10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f64(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4-NOT: call <4 x double> @amd_vrd4_exp10(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8-NOT: call <8 x double> @amd_vrd8_exp10(<8 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @exp10(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32(
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
+; CHECK-VF8-NOT: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp10f(<8 x float> [[TMP4:%.*]])
+; CHECK-VF16-NOT: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp10f(<16 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @exp10f(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp10_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f64_intrinsic(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4-NOT: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp10(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8-NOT: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp10(<8 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.exp10.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32_intrinsic(
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
+; CHECK-VF8-NOT: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp10f(<8 x float> [[TMP4:%.*]])
+; CHECK-VF16-NOT: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp10f(<16 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.exp10.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @sincos_f64
+; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
+; CHECK-VF4: call void @amd_vrd4_sincos(<4 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF8: call void @amd_vrd8_sincos(<8 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepa = getelementptr double, ptr %a, i64 %indvars.iv
+ %num = load double, ptr %gepa, align 8
+ %gepb = getelementptr double, ptr %b, i64 %indvars.iv
+ %gepc = getelementptr double, ptr %c, i64 %indvars.iv
+ call void @sincos(double %num, ptr %gepb, ptr %gepc)
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @sincos_f32
+; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
+; CHECK-VF4: call void @amd_vrs4_sincosf(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF8: call void @amd_vrs8_sincosf(<8 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF16: call void @amd_vrs16_sincosf(<16 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepa = getelementptr float, ptr %a, i64 %indvars.iv
+ %num = load float, ptr %gepa, align 8
+ %gepb = getelementptr float, ptr %b, i64 %indvars.iv
+ %gepc = getelementptr float, ptr %c, i64 %indvars.iv
+ call void @sincosf(float %num, ptr %gepb, ptr %gepc)
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
attributes #0 = { nounwind readnone }
+
+declare double @exp10(double) #0
+declare float @exp10f(float) #0
+declare double @llvm.exp10.f64(double) #0
+declare float @llvm.exp10.f32(float) #0
+declare void @sincos(double, ptr, ptr)
+declare void @sincosf(float, ptr, ptr)
\ No newline at end of file
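The sincos CHECK lines above correspond to a vector loop body of roughly this shape (a hand-written sketch at VF 4; value names are invented):

  %wide.load = load <4 x double>, ptr %gepa, align 8
  call void @amd_vrd4_sincos(<4 x double> %wide.load, ptr %gepb, ptr %gepc)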
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 4e4b81e89a3270..76cccbd6f39cc3 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -14,10 +14,15 @@
; SVML-SAME: ptr @__svml_log10f4,
; SVML-SAME: ptr @__svml_log10f8,
; SVML-SAME: ptr @__svml_log10f16
-; AMDLIBM-SAME: [6 x ptr] [
+; AMDLIBM-SAME: [11 x ptr] [
; AMDLIBM-SAME: ptr @amd_vrd2_sin,
; AMDLIBM-SAME: ptr @amd_vrd4_sin,
; AMDLIBM-SAME: ptr @amd_vrd8_sin,
+; AMDLIBM-SAME: ptr @amd_vrd4_sincos,
+; AMDLIBM-SAME: ptr @amd_vrd8_sincos,
+; AMDLIBM-SAME: ptr @amd_vrs4_sincosf,
+; AMDLIBM-SAME: ptr @amd_vrs8_sincosf,
+; AMDLIBM-SAME: ptr @amd_vrs16_sincosf,
; AMDLIBM-SAME: ptr @amd_vrs4_log10f,
; AMDLIBM-SAME: ptr @amd_vrs8_log10f,
; AMDLIBM-SAME: ptr @amd_vrs16_log10f
@@ -106,6 +111,7 @@ define void @sincos_f64(double %in, ptr %sin, ptr %cos) {
; COMMON-LABEL: @sincos_f64(
; SLEEFGNUABI: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
; ARMPL: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
+; AMDLIBM: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
call void @sincos(double %in, ptr %sin, ptr %cos)
ret void
}
@@ -116,6 +122,7 @@ define void @sincos_f32(float %in, ptr %sin, ptr %cos) {
; COMMON-LABEL: @sincos_f32(
; SLEEFGNUABI: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
; ARMPL: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
+; AMDLIBM: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
call void @sincosf(float %in, ptr %sin, ptr %cos)
ret void
}
@@ -145,7 +152,7 @@ declare void @sincospif(float, ptr, ptr) #0
define float @call_llvm.log10.f32(float %in) {
; COMMON-LABEL: @call_llvm.log10.f32(
; SVML: call float @llvm.log10.f32(float %{{.*}})
-; AMDLIBM: call float @llvm.log10.f32(float %{{.*}})
+; AMDLIBM: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
; LIBMVEC-X86: call float @llvm.log10.f32(float %{{.*}})
; MASSV: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
; ACCELERATE: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
@@ -171,6 +178,11 @@ declare float @llvm.log10.f32(float) #0
; AMDLIBM: declare <2 x double> @amd_vrd2_sin(<2 x double>)
; AMDLIBM: declare <4 x double> @amd_vrd4_sin(<4 x double>)
; AMDLIBM: declare <8 x double> @amd_vrd8_sin(<8 x double>)
+; AMDLIBM: declare void @amd_vrd4_sincos(<4 x double>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrd8_sincos(<8 x double>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs4_sincosf(<4 x float>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs8_sincosf(<8 x float>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs16_sincosf(<16 x float>, ptr, ptr)
; AMDLIBM: declare <4 x float> @amd_vrs4_log10f(<4 x float>)
; AMDLIBM: declare <8 x float> @amd_vrs8_log10f(<8 x float>)
; AMDLIBM: declare <16 x float> @amd_vrs16_log10f(<16 x float>)
@@ -228,6 +240,17 @@ attributes #0 = { nounwind readnone }
; AMDLIBM-SAME: "_ZGV_LLVM_N2v_sin(amd_vrd2_sin),
; AMDLIBM-SAME: _ZGV_LLVM_N4v_sin(amd_vrd4_sin),
; AMDLIBM-SAME: _ZGV_LLVM_N8v_sin(amd_vrd8_sin)" }
+; AMDLIBM: attributes #[[SINCOS]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4vl8l8_sincos(amd_vrd4_sincos),
+; AMDLIBM-SAME: _ZGV_LLVM_N8vl8l8_sincos(amd_vrd8_sincos)" }
+; AMDLIBM: attributes #[[SINCOSF]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(amd_vrs4_sincosf),
+; AMDLIBM-SAME: _ZGV_LLVM_N8vl4l4_sincosf(amd_vrs8_sincosf),
+; AMDLIBM-SAME: _ZGV_LLVM_N16vl4l4_sincosf(amd_vrs16_sincosf)" }
+; AMDLIBM: attributes #[[LOG10]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(amd_vrs4_log10f),
+; AMDLIBM-SAME: _ZGV_LLVM_N8v_llvm.log10.f32(amd_vrs8_log10f),
+; AMDLIBM-SAME: _ZGV_LLVM_N16v_llvm.log10.f32(amd_vrs16_log10f)" }
; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
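Worth spelling out for this test: the inject-tli-mappings pass does not rewrite the scalar calls, it only attaches vector-function-abi-variant attributes that LoopVectorize consumes later. Sketched from the CHECK lines above (the attribute number is illustrative):

  call void @sincos(double %in, ptr %sin, ptr %cos) #1
  ...
  attributes #1 = { "vector-function-abi-variant"=
    "_ZGV_LLVM_N4vl8l8_sincos(amd_vrd4_sincos),_ZGV_LLVM_N8vl8l8_sincos(amd_vrd8_sincos)" }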
From 167616dc4d4fc2efaed4590637283a59ed9f69ac Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Mon, 23 Sep 2024 18:28:00 +0530
Subject: [PATCH 2/2] Add more vector calls for -fveclib=AMDLIBM
Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
---
.../include/llvm/Analysis/TargetLibraryInfo.h | 1 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 3 +-
llvm/include/llvm/Analysis/VecFuncs.def | 24 ++
llvm/lib/Analysis/ValueTracking.cpp | 4 +
llvm/lib/Analysis/VectorUtils.cpp | 1 +
.../LoopVectorize/X86/amdlibm-calls-finite.ll | 275 ++++++++++++------
.../LoopVectorize/X86/amdlibm-calls.ll | 216 ++++++++++++++
llvm/test/Transforms/Util/add-TLI-mappings.ll | 27 +-
8 files changed, 455 insertions(+), 96 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 9e543b844ad768..a013814dba63ab 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -432,6 +432,7 @@ class TargetLibraryInfo {
case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl:
case LibFunc_log2: case LibFunc_log2f: case LibFunc_log2l:
case LibFunc_exp2: case LibFunc_exp2f: case LibFunc_exp2l:
+ case LibFunc_exp10: case LibFunc_exp10f: case LibFunc_exp10l:
case LibFunc_ldexp: case LibFunc_ldexpf: case LibFunc_ldexpl:
case LibFunc_memcpy: case LibFunc_memset: case LibFunc_memmove:
case LibFunc_memcmp: case LibFunc_bcmp: case LibFunc_strcmp:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 79c8bafbc6c0df..35a5bc3897b1c7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -172,7 +172,8 @@ class TargetTransformInfoImplBase {
Name == "sinh" || Name == "sinhf" || Name == "sinhl" ||
Name == "cosh" || Name == "coshf" || Name == "coshl" ||
Name == "tanh" || Name == "tanhf" || Name == "tanhl" ||
- Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
+ Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl" ||
+ Name == "exp10" || Name == "exp10l" || Name == "exp10f")
return false;
// clang-format on
// These are all likely to be optimized into something smaller.
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 532a3ca334b1ae..06024131c5bdac 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1317,14 +1317,17 @@ TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLV
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log10", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log10_finite", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1339,6 +1342,12 @@ TLI_DEFINE_VECFUNC("erf", "amd_vrd8_erf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("exp10", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("exp10f", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp10_finite", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp10f_finite", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
TLI_DEFINE_VECFUNC("expm1", "amd_vrd2_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("expm1f", "amd_vrs4_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1369,10 +1378,19 @@ TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LL
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__asin_finite", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1410,6 +1428,12 @@ TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_
TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd4_sincos", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl8l8")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd8_sincos", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl8l8")
+
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs4_sincosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs8_sincosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs16_sincosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vl4l4")
#else
#error "Must choose which vector library functions are to be defined."
#endif
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 56eb3f99b39d2c..8749b879e75331 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -4213,6 +4213,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
case LibFunc_exp2f:
case LibFunc_exp2l:
return Intrinsic::exp2;
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ return Intrinsic::exp10;
case LibFunc_log:
case LibFunc_logf:
case LibFunc_logl:
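This ValueTracking change is what lets a plain exp10 libcall participate at all: getIntrinsicForCallSite now maps it to Intrinsic::exp10, the same way exp and exp2 are handled alongside it. Sketch of the first step (illustrative IR, call-site attributes elided):

  %r = call fast double @exp10(double %x)   ; now recognized as llvm.exp10.f64
  ; which the vectorizer may widen to:
  %vr = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %vx)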
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d45d3bbefe4fd3..8411d4cd02a162 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -76,6 +76,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::cosh:
case Intrinsic::tanh:
case Intrinsic::exp:
+ case Intrinsic::exp10:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
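And isTriviallyVectorizable gates that widening; with Intrinsic::exp10 listed, the widened intrinsic can in turn be mapped to the AMDLIBM routine wherever a TLI entry exists, here only at VF 2 per the new "llvm.exp10.f64" line in VecFuncs.def. Sketch:

  %vr = call <2 x double> @llvm.exp10.v2f64(<2 x double> %vx)
  ; -> with the TLI mapping applied:
  %vr = call <2 x double> @amd_vrd2_exp10(<2 x double> %vx)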
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
index 54bb9352f3c89c..9899eded738086 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
@@ -7,12 +7,10 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare float @__expf_finite(float) #0
-
+define void @exp_f32(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f32
; CHECK: <4 x float> @amd_vrs4_expf
; CHECK: ret
-define void @exp_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -25,23 +23,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__exp_finite(double) #0
-
+define void @exp_f64(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f64
; CHECK: <4 x double> @amd_vrd4_exp
; CHECK: ret
-define void @exp_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -54,25 +45,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!11 = distinct !{!11, !12, !13}
-!12 = !{!"llvm.loop.vectorize.width", i32 4}
-!13 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-
-
-declare float @__logf_finite(float) #0
-
+define void @log_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log_f32
; CHECK: <4 x float> @amd_vrs4_logf
; CHECK: ret
-define void @log_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -85,23 +67,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!21 = distinct !{!21, !22, !23}
-!22 = !{!"llvm.loop.vectorize.width", i32 4}
-!23 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log_finite(double) #0
-
+define void @log_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log_f64
; CHECK: <4 x double> @amd_vrd4_log
; CHECK: ret
-define void @log_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -114,23 +89,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!31 = distinct !{!31, !32, !33}
-!32 = !{!"llvm.loop.vectorize.width", i32 4}
-!33 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare float @__powf_finite(float, float) #0
-
+define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32
; CHECK: <4 x float> @amd_vrs4_powf
; CHECK: ret
-define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -145,23 +113,16 @@ for.body: ; preds = %for.body, %entry
store float %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!41 = distinct !{!41, !42, !43}
-!42 = !{!"llvm.loop.vectorize.width", i32 4}
-!43 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__pow_finite(double, double) #0
-
+define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64
; CHECK: <4 x double> @amd_vrd4_pow
; CHECK: ret
-define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -176,18 +137,12 @@ for.body: ; preds = %for.body, %entry
store double %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!51 = distinct !{!51, !52, !53}
-!52 = !{!"llvm.loop.vectorize.width", i32 4}
-!53 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__exp2f_finite(float) #0
-
define void @exp2f_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2f_finite(
; CHECK: call <4 x float> @amd_vrs4_exp2f(<4 x float> %{{.*}})
@@ -205,18 +160,12 @@ for.body:
store float %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !61
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!61 = distinct !{!61, !62, !63}
-!62 = !{!"llvm.loop.vectorize.width", i32 4}
-!63 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare double @__exp2_finite(double) #0
-
define void @exp2_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2_finite(
; CHECK: call <4 x double> @amd_vrd4_exp2(<4 x double> {{.*}})
@@ -234,22 +183,16 @@ for.body:
store double %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !71
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!71 = distinct !{!71, !72, !73}
-!72 = !{!"llvm.loop.vectorize.width", i32 4}
-!73 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log2f_finite(float) #0
-
+define void @log2_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f32
; CHECK: <4 x float> @amd_vrs4_log2f
; CHECK: ret
-define void @log2_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -262,23 +205,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!81 = distinct !{!21, !22, !23}
-!82 = !{!"llvm.loop.vectorize.width", i32 4}
-!83 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log2_finite(double) #0
-
+define void @log2_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f64
; CHECK: <4 x double> @amd_vrd4_log2
; CHECK: ret
-define void @log2_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -291,22 +227,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!91 = distinct !{!31, !32, !33}
-!92 = !{!"llvm.loop.vectorize.width", i32 4}
-!93 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log10f_finite(float) #0
-
+define void @log10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32
; CHECK: <4 x float> @amd_vrs4_log10f
; CHECK: ret
-define void @log10_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -319,14 +249,173 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @log10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_finite(
+; CHECK: call <2 x double> @amd_vrd2_log10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__log10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_finite(
+; CHECK: call <2 x double> @amd_vrd2_exp10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__exp10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32
+; CHECK: <4 x float> @amd_vrs4_exp10f
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__exp10f_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!101 = distinct !{!21, !22, !23}
-!102 = !{!"llvm.loop.vectorize.width", i32 4}
-!103 = !{!"llvm.loop.vectorize.enable", i1 true}
+define void @asin_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_finite(
+; CHECK: call <8 x double> @amd_vrd8_asin(<8 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__asin_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end:
+ ret void
+}
+define void @asinf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asinf_finite
+; CHECK: <4 x float> @amd_vrs4_asinf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__asinf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @acosf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @acosf_finite
+; CHECK: <4 x float> @amd_vrs4_acosf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__acosf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!4 = distinct !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!7 = distinct !{!7, !8, !9}
+!8 = !{!"llvm.loop.vectorize.width", i32 8}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+declare float @__expf_finite(float) #0
+declare double @__exp_finite(double) #0
+declare double @__log_finite(double) #0
+declare float @__logf_finite(float) #0
+declare float @__powf_finite(float, float) #0
+declare double @__pow_finite(double, double) #0
+declare float @__exp2f_finite(float) #0
+declare double @__exp2_finite(double) #0
+declare float @__log2f_finite(float) #0
+declare double @__log2_finite(double) #0
+declare float @__log10f_finite(float) #0
+declare double @__log10_finite(double) #0
+declare double @__exp10_finite(double) #0
+declare float @__exp10f_finite(float) #0
+declare double @__asin_finite(double) #0
+declare float @__asinf_finite(float) #0
+declare float @__acosf_finite(float) #0
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 4acc7fe7eaccf6..d39ebaa4cf1ed6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1444,6 +1444,32 @@ for.end:
ret void
}
+define void @log10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.log10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @log10(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
define void @log10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log10.v2f32(<2 x float> [[TMP4:%.*]])
@@ -1470,6 +1496,32 @@ for.end:
ret void
}
+define void @log10_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64_intrinsic(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.log10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.log10.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
define void @log10_f32_intrinsic(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32_intrinsic(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log10.v2f32(<2 x float> [[TMP4:%.*]])
@@ -1600,4 +1652,168 @@ for.end:
ret void
}
+define void @exp10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f64(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: call <4 x double> @llvm.exp10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: call <8 x double> @llvm.exp10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: call <16 x double> @llvm.exp10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @exp10(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
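+
+; Note: as with log10, the double-precision exp10 mapping is registered
+; only at VF 2 (amd_vrd2_exp10); wider factors fall back to the
+; llvm.exp10 intrinsic, per the CHECK-VF4/8/16 lines above.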
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp10.v2f32(<2 x float> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @llvm.exp10.v8f32(<8 x float> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.exp10.v16f32(<16 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @exp10f(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
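+
+; Note: the single-precision exp10 mapping is registered only at VF 4
+; (amd_vrs4_exp10f); VF 2/8/16 keep the llvm.exp10 intrinsic. The
+; *_intrinsic variants below check the same mappings via llvm.exp10.f64
+; and llvm.exp10.f32.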
+
+define void @exp10_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f64_intrinsic(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.exp10.v4f64(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.exp10.v8f64(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp10.v16f64(<16 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.exp10.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32_intrinsic(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp10.v2f32(<2 x float> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
+; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @llvm.exp10.v8f32(<8 x float> [[TMP4:%.*]])
+; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.exp10.v16f32(<16 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.exp10.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @sincos_f64
+; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
+; CHECK-VF2-NOT: call void @amd_vrd2_sincos(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF4: call void @amd_vrd4_sincos(<4 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF8: call void @amd_vrd8_sincos(<8 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepa = getelementptr double, ptr %a, i64 %indvars.iv
+ %num = load double, ptr %gepa, align 8
+ %gepb = getelementptr double, ptr %b, i64 %indvars.iv
+ %gepc = getelementptr double, ptr %c, i64 %indvars.iv
+ call void @sincos(double %num, ptr %gepb, ptr %gepc)
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
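+
+; Note: AMDLIBM has no 2-lane double sincos variant, hence the
+; CHECK-VF2-NOT line above; only amd_vrd4_sincos and amd_vrd8_sincos are
+; expected. sincos returns its results through its two pointer arguments,
+; so the vector variants take a vector plus two pointers. The float test
+; below covers amd_vrs4/8/16_sincosf the same way.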
+
+define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @sincos_f32
+; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
+; CHECK-VF4: call void @amd_vrs4_sincosf(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF8: call void @amd_vrs8_sincosf(<8 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK-VF16: call void @amd_vrs16_sincosf(<16 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepa = getelementptr float, ptr %a, i64 %indvars.iv
+  %num = load float, ptr %gepa, align 4
+ %gepb = getelementptr float, ptr %b, i64 %indvars.iv
+ %gepc = getelementptr float, ptr %c, i64 %indvars.iv
+ call void @sincosf(float %num, ptr %gepb, ptr %gepc)
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
attributes #0 = { nounwind readnone }
+
+declare double @exp10(double) #0
+declare float @exp10f(float) #0
+declare double @llvm.exp10.f64(double) #0
+declare float @llvm.exp10.f32(float) #0
+declare void @sincos(double, ptr, ptr)
+declare void @sincosf(float, ptr, ptr)
\ No newline at end of file
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 4e4b81e89a3270..76cccbd6f39cc3 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -14,10 +14,15 @@
; SVML-SAME: ptr @__svml_log10f4,
; SVML-SAME: ptr @__svml_log10f8,
; SVML-SAME: ptr @__svml_log10f16
-; AMDLIBM-SAME: [6 x ptr] [
+; AMDLIBM-SAME: [11 x ptr] [
; AMDLIBM-SAME: ptr @amd_vrd2_sin,
; AMDLIBM-SAME: ptr @amd_vrd4_sin,
; AMDLIBM-SAME: ptr @amd_vrd8_sin,
+; AMDLIBM-SAME: ptr @amd_vrd4_sincos,
+; AMDLIBM-SAME: ptr @amd_vrd8_sincos,
+; AMDLIBM-SAME: ptr @amd_vrs4_sincosf,
+; AMDLIBM-SAME: ptr @amd_vrs8_sincosf,
+; AMDLIBM-SAME: ptr @amd_vrs16_sincosf,
; AMDLIBM-SAME: ptr @amd_vrs4_log10f,
; AMDLIBM-SAME: ptr @amd_vrs8_log10f,
; AMDLIBM-SAME: ptr @amd_vrs16_log10f
@@ -106,6 +111,7 @@ define void @sincos_f64(double %in, ptr %sin, ptr %cos) {
; COMMON-LABEL: @sincos_f64(
; SLEEFGNUABI: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
; ARMPL: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
+; AMDLIBM: call void @sincos(double %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOS:[0-9]+]]
call void @sincos(double %in, ptr %sin, ptr %cos)
ret void
}
@@ -116,6 +122,7 @@ define void @sincos_f32(float %in, ptr %sin, ptr %cos) {
; COMMON-LABEL: @sincos_f32(
; SLEEFGNUABI: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
; ARMPL: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
+; AMDLIBM: call void @sincosf(float %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) #[[SINCOSF:[0-9]+]]
call void @sincosf(float %in, ptr %sin, ptr %cos)
ret void
}
@@ -145,7 +152,7 @@ declare void @sincospif(float, ptr, ptr) #0
define float @call_llvm.log10.f32(float %in) {
; COMMON-LABEL: @call_llvm.log10.f32(
; SVML: call float @llvm.log10.f32(float %{{.*}})
-; AMDLIBM: call float @llvm.log10.f32(float %{{.*}})
+; AMDLIBM: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
; LIBMVEC-X86: call float @llvm.log10.f32(float %{{.*}})
; MASSV: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
; ACCELERATE: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
@@ -171,6 +178,11 @@ declare float @llvm.log10.f32(float) #0
; AMDLIBM: declare <2 x double> @amd_vrd2_sin(<2 x double>)
; AMDLIBM: declare <4 x double> @amd_vrd4_sin(<4 x double>)
; AMDLIBM: declare <8 x double> @amd_vrd8_sin(<8 x double>)
+; AMDLIBM: declare void @amd_vrd4_sincos(<4 x double>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrd8_sincos(<8 x double>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs4_sincosf(<4 x float>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs8_sincosf(<8 x float>, ptr, ptr)
+; AMDLIBM: declare void @amd_vrs16_sincosf(<16 x float>, ptr, ptr)
; AMDLIBM: declare <4 x float> @amd_vrs4_log10f(<4 x float>)
; AMDLIBM: declare <8 x float> @amd_vrs8_log10f(<8 x float>)
; AMDLIBM: declare <16 x float> @amd_vrs16_log10f(<16 x float>)
@@ -228,6 +240,17 @@ attributes #0 = { nounwind readnone }
; AMDLIBM-SAME: "_ZGV_LLVM_N2v_sin(amd_vrd2_sin),
; AMDLIBM-SAME: _ZGV_LLVM_N4v_sin(amd_vrd4_sin),
; AMDLIBM-SAME: _ZGV_LLVM_N8v_sin(amd_vrd8_sin)" }
+; AMDLIBM: attributes #[[SINCOS]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4vl8l8_sincos(amd_vrd4_sincos),
+; AMDLIBM-SAME: _ZGV_LLVM_N8vl8l8_sincos(amd_vrd8_sincos)" }
+; AMDLIBM: attributes #[[SINCOSF]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(amd_vrs4_sincosf),
+; AMDLIBM-SAME: _ZGV_LLVM_N8vl4l4_sincosf(amd_vrs8_sincosf),
+; AMDLIBM-SAME: _ZGV_LLVM_N16vl4l4_sincosf(amd_vrs16_sincosf)" }
+; AMDLIBM: attributes #[[LOG10]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(amd_vrs4_log10f),
+; AMDLIBM-SAME: _ZGV_LLVM_N8v_llvm.log10.f32(amd_vrs8_log10f),
+; AMDLIBM-SAME: _ZGV_LLVM_N16v_llvm.log10.f32(amd_vrs16_log10f)" }
; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
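
For reference, the flow these tests pin down: inject-tli-mappings annotates
each scalar call with a "vector-function-abi-variant" attribute listing the
available AMDLIBM variants, and the loop vectorizer then selects the variant
whose VF matches. A minimal illustrative sketch (not taken from the tests
above):

  ; scalar call, as annotated by inject-tli-mappings
  %r = call double @exp10(double %x) #0
  attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_exp10(amd_vrd2_exp10)" }

  ; after vectorization at VF 2, the mapped AMDLIBM routine is called
  %vr = call <2 x double> @amd_vrd2_exp10(<2 x double> %vx)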