[llvm] [NVPTX] Support llvm.{exp2, log2} for f32 and vector of f32 (PR #120519)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 12:35:48 PST 2024
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/120519
>From 1e5be93a281995970871e76b4becfe2aeb4093df Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 18 Dec 2024 23:37:41 -0500
Subject: [PATCH] [NVPTX] Support llvm.{exp2,log2} for f32 and vector of f32
Lower llvm.exp2 to ex2.approx and llvm.log2 to lg2.approx for f32 and
all vectors of f32. Because these are approximate instructions, support
is hidden behind the flags -nvptx-approx-{exp2,log2}f32
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 ++++-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 ++++++
llvm/test/CodeGen/NVPTX/fexp2.ll | 61 +++++++++++++++++++++
llvm/test/CodeGen/NVPTX/flog2.ll | 61 +++++++++++++++++++++
4 files changed, 153 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/fexp2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/flog2.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..99d6f78ca8dd93 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -94,6 +94,16 @@ static cl::opt<bool> UsePrecSqrtF32(
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
+static cl::opt<bool> UseApproxExp2F32(
+ "nvptx-approx-exp2f32", cl::Hidden,
+ cl::desc("NVPTX Specific: 0 no ex2 support (default), 1 use ex2.approx"),
+ cl::init(false));
+
+static cl::opt<bool> UseApproxLog2F32(
+ "nvptx-approx-log2f32",
+ cl::desc("NVPTX Specific: 0 no lg2 support (default), 1 use lg2.approx"),
+ cl::init(false));
+
static cl::opt<bool> ForceMinByValParamAlign(
"nvptx-force-min-byval-param-align", cl::Hidden,
cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
@@ -968,7 +978,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+ if (UseApproxExp2F32)
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ if (UseApproxLog2F32)
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
// No FPOW or FREM in PTX.
// Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index abaf8e0b0ec1f8..174bfee7fd159e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -518,6 +518,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
Requires<[hasBF16Math, noFMA]>;
}
+// Template for operations which take one f32 operand. Provides two
+// instructions: <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush subnormal inputs and
+// results to zero).
+multiclass F1<string OpcStr, SDNode OpNode> {
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".ftz.f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>;
+}
+
// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
@@ -1204,6 +1217,10 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
+defm FEXP2 : F1<"ex2.approx", fexp2>;
+
+defm FLOG2 : F1<"lg2.approx", flog2>;
+
//
// F16 NEG
//
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
new file mode 100644
index 00000000000000..920421776926a6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-exp2f32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-exp2f32 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: exp2_test
+define float @exp2_test(float %in) {
+; CHECK-LABEL: exp2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_ftz_test
+define float @exp2_ftz_test(float %in) #0 {
+; CHECK-LABEL: exp2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_test_v
+define <4 x float> @exp2_test_v(<4 x float> %in) {
+; CHECK-LABEL: exp2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [exp2_test_v_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f5, %f4;
+; CHECK-NEXT: ex2.approx.f32 %f6, %f3;
+; CHECK-NEXT: ex2.approx.f32 %f7, %f2;
+; CHECK-NEXT: ex2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+ ret <4 x float> %exp2
+}
+
+declare float @llvm.exp2.f32(float %val)
+
+declare <4 x float> @llvm.exp2.v4f32(<4 x float> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
new file mode 100644
index 00000000000000..c968a7adbff4d1
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: log2_test
+define float @log2_test(float %in) {
+; CHECK-LABEL: log2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_ftz_test
+define float @log2_ftz_test(float %in) #0 {
+; CHECK-LABEL: log2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_test_v
+define <4 x float> @log2_test_v(<4 x float> %in) {
+; CHECK-LABEL: log2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [log2_test_v_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f5, %f4;
+; CHECK-NEXT: lg2.approx.f32 %f6, %f3;
+; CHECK-NEXT: lg2.approx.f32 %f7, %f2;
+; CHECK-NEXT: lg2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+ ret <4 x float> %log2
+}
+
+declare float @llvm.log2.f32(float %val)
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
More information about the llvm-commits
mailing list