[llvm] [LLVM][NVPTX] Upstream tanh intrinsic for libdevice (PR #149596)
Meredith Julian via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 11:15:57 PDT 2025
https://github.com/mjulian31 updated https://github.com/llvm/llvm-project/pull/149596
>From 52fadded02318af120dc8820c186bcfae2288a5b Mon Sep 17 00:00:00 2001
From: Meredith Julian <mjulian at nvidia.com>
Date: Thu, 17 Jul 2025 15:53:49 -0700
Subject: [PATCH 1/4] initial commit
---
llvm/lib/IR/AutoUpgrade.cpp | 7 ++
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 7 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 5 +-
.../Assembler/auto_upgrade_nvvm_intrinsics.ll | 9 +++
llvm/test/CodeGen/NVPTX/tanhf.ll | 70 +++++++++++++++++++
5 files changed, 95 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/tanhf.ll
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 28ed1e520ce52..71591074cea50 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1450,6 +1450,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
+ .Case("tanh.approx.f32", true)
.Default(false);
if (Expand) {
@@ -2543,6 +2544,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
+ } else if (Name == "tanh.approx.f32") {
+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32
+ FastMathFlags FMF;
+ FMF.setApproxFunc();
+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),
+ FMF);
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7883acca6d8b3..8eef4dab69724 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // fsin, fcos, and ftanh are not supported on f64
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5df4c6de7fd8..720d361c6c73f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1234,7 +1234,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1250,6 +1250,9 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index a17f11a680aa2..362586af4f9b7 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float)
declare float @llvm.nvvm.fabs.ftz.f(float)
declare double @llvm.nvvm.fabs.d(double)
+declare float @llvm.nvvm.tanh.approx.f32(float)
+
declare i16 @llvm.nvvm.max.s(i16, i16)
declare i32 @llvm.nvvm.max.i(i32, i32)
declare i64 @llvm.nvvm.max.ll(i64, i64)
@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) {
ret void
}
+; CHECK-LABEL: @tanh
+define void @tanh(float %a) {
+; CHECK: call afn float @llvm.tanh.f32(float %a)
+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)
+ ret void
+}
+
; CHECK-LABEL: @min_max
define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) {
; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
new file mode 100644
index 0000000000000..73f3ca68768e9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+
+
+define float @test1(float %in) local_unnamed_addr {
+; CHECK-LABEL: test1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define float @test2(float %in) local_unnamed_addr {
+; CHECK-LABEL: test2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = tail call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define half @test3(half %in) local_unnamed_addr {
+; CHECK-LABEL: test3(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+define half @test4(half %in) local_unnamed_addr {
+; CHECK-LABEL: test4(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = tail call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+declare float @llvm.tanh.f32(float)
+declare half @llvm.tanh.f16(half)
+
>From 484adfeae8c2b635675ef48dce6ea9475d4db233 Mon Sep 17 00:00:00 2001
From: Meredith Julian <mjulian at nvidia.com>
Date: Fri, 18 Jul 2025 17:06:44 -0700
Subject: [PATCH 2/4] fix comment
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8eef4dab69724..e26c07902f2c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -955,7 +955,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- // fsin, fcos, and ftanh are not supported on f64
+ // only div/rem/sqrt are legal for f64
if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
setOperationAction(Op, MVT::f64, Legal);
}
>From bfda682d416699f53b3d4e989ea486482123f13b Mon Sep 17 00:00:00 2001
From: Meredith Julian <mjulian at nvidia.com>
Date: Mon, 21 Jul 2025 14:27:16 -0700
Subject: [PATCH 3/4] simplify test
---
llvm/test/CodeGen/NVPTX/tanhf.ll | 35 ++------------------------------
1 file changed, 2 insertions(+), 33 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
index 73f3ca68768e9..24981b7cace22 100644
--- a/llvm/test/CodeGen/NVPTX/tanhf.ll
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -17,28 +17,14 @@ define float @test1(float %in) local_unnamed_addr {
ret float %call
}
-define float @test2(float %in) local_unnamed_addr {
+define half @test2(half %in) local_unnamed_addr {
; CHECK-LABEL: test2(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];
-; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %call = tail call afn float @llvm.tanh.f32(float %in)
- ret float %call
-}
-
-define half @test3(half %in) local_unnamed_addr {
-; CHECK-LABEL: test3(
-; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];
+; CHECK-NEXT: ld.param.b16 %rs1, [test2_param_0];
; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
@@ -48,23 +34,6 @@ define half @test3(half %in) local_unnamed_addr {
ret half %call
}
-define half @test4(half %in) local_unnamed_addr {
-; CHECK-LABEL: test4(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];
-; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
-; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
-; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
-; CHECK-NEXT: ret;
- %call = tail call afn half @llvm.tanh.f16(half %in)
- ret half %call
-}
-
declare float @llvm.tanh.f32(float)
declare half @llvm.tanh.f16(half)
>From 12ed6168a4c16a2d1bd5a1877a058bfceaf48b1a Mon Sep 17 00:00:00 2001
From: Meredith Julian <mjulian at nvidia.com>
Date: Wed, 23 Jul 2025 11:15:30 -0700
Subject: [PATCH 4/4] add ptx70 and sm75 predicates, update test
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 ++-
llvm/test/CodeGen/NVPTX/tanhf.ll | 5 +++--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 720d361c6c73f..442b900d280b2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1252,7 +1252,8 @@ def COS_APPROX_f32 :
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
def TANH_APPROX_f32 :
BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
- [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>;
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>,
+ Requires<[hasPTX<70>, hasSM<75>]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
index 24981b7cace22..6f4eb222e0b38 100644
--- a/llvm/test/CodeGen/NVPTX/tanhf.ll
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+target triple = "nvptx64-nvidia-cuda"
define float @test1(float %in) local_unnamed_addr {
; CHECK-LABEL: test1(
More information about the llvm-commits
mailing list