[llvm] [NVPTX] Support llvm.{exp2, log2} for f32 and vector of f32 (PR #120519)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 11:16:11 PST 2024
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/120519
>From 3761255f1e94a7a6096f435c7f2863d3b49e260c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 18 Dec 2024 23:37:41 -0500
Subject: [PATCH] [NVPTX] Support llvm.{exp2,log2} for f32 and vector of f32
- Lower llvm.exp2 to ex2.approx for f32 and all vectors of f32.
- Lower llvm.log2 to lg2.approx for f32 and all vectors of f32.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 ++++++
llvm/test/CodeGen/NVPTX/fexp2.ll | 61 +++++++++++++++++++++
llvm/test/CodeGen/NVPTX/flog2.ll | 61 +++++++++++++++++++++
4 files changed, 141 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/fexp2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/flog2.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..78fd5612e373d6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -968,7 +968,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
// No FPOW or FREM in PTX.
// Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index abaf8e0b0ec1f8..174bfee7fd159e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -518,6 +518,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
Requires<[hasBF16Math, noFMA]>;
}
+// Template for operations which take one f32 operand. Provides two
+// instructions: <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush subnormal inputs and
+// results to zero).
+multiclass F1<string OpcStr, SDNode OpNode> {
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".ftz.f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>;
+}
+
// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
@@ -1204,6 +1217,10 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
+defm FEXP2 : F1<"ex2.approx", fexp2>;
+
+defm FLOG2 : F1<"lg2.approx", flog2>;
+
//
// F16 NEG
//
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
new file mode 100644
index 00000000000000..41610772d30676
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx14 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: exp2_test
+define float @exp2_test(float %in) {
+; CHECK-LABEL: exp2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_ftz_test
+define float @exp2_ftz_test(float %in) #0 {
+; CHECK-LABEL: exp2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_test_v
+define <4 x float> @exp2_test_v(<4 x float> %in) {
+; CHECK-LABEL: exp2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [exp2_test_v_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f5, %f4;
+; CHECK-NEXT: ex2.approx.f32 %f6, %f3;
+; CHECK-NEXT: ex2.approx.f32 %f7, %f2;
+; CHECK-NEXT: ex2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+ ret <4 x float> %exp2
+}
+
+declare float @llvm.exp2.f32(float %val)
+
+declare <4 x float> @llvm.exp2.v4f32(<4 x float> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
new file mode 100644
index 00000000000000..26958188949a83
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx14 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: log2_test
+define float @log2_test(float %in) {
+; CHECK-LABEL: log2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_ftz_test
+define float @log2_ftz_test(float %in) #0 {
+; CHECK-LABEL: log2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_test_v
+define <4 x float> @log2_test_v(<4 x float> %in) {
+; CHECK-LABEL: log2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [log2_test_v_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f5, %f4;
+; CHECK-NEXT: lg2.approx.f32 %f6, %f3;
+; CHECK-NEXT: lg2.approx.f32 %f7, %f2;
+; CHECK-NEXT: lg2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+ ret <4 x float> %log2
+}
+
+declare float @llvm.log2.f32(float %val)
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
More information about the llvm-commits
mailing list