[llvm] [NVPTX] Support exp2 and log2 for f32/f16/bf16 and vectors (PR #120519)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 24 01:55:04 PST 2024
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/120519
>From 777d7d50dcaa45134989aad70da3ebd224a1002f Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 18 Dec 2024 23:37:41 -0500
Subject: [PATCH] [NVPTX] Improve support for {ex2,lg2}.approx
- Add support for bf16 and bf16x2 PTX types
- Includes new NVVM intrinsics: `@llvm.nvvm.ex2.approx.{bf16,bf16x2}`
- Add support for `@llvm.exp2()`:
- LLVM: `float` -> PTX: `ex2.approx{.ftz}.f32`
- LLVM: `half` -> PTX: `ex2.approx.f16`
- LLVM: `<2 x half>` -> PTX: `ex2.approx.f16x2`
- LLVM: `bfloat` -> PTX: `ex2.approx.ftz.bf16`
- LLVM: `<2 x bfloat>` -> PTX: `ex2.approx.ftz.bf16x2`
- Any operations with non-native vector widths are expanded. On
targets not supporting f16/bf16, values are promoted to f32.
- Add *CONDITIONAL* support for `@llvm.log2()` [^1]:
- LLVM: `float` -> PTX: `lg2.approx{.ftz}.f32`
- Support for f16/bf16 is emulated by promoting values to f32.
[^1]: CUDA implements `exp2()` with `ex2.approx` but `log2()` is
implemented differently, so this is off by default. To enable, use the
flag `-nvptx-approx-log2f32`.
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 4 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 33 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 +
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 24 ++
llvm/test/CodeGen/NVPTX/bf16-ex2.ll | 38 ++
llvm/test/CodeGen/NVPTX/f16-ex2.ll | 40 +-
llvm/test/CodeGen/NVPTX/f32-ex2.ll | 36 ++
llvm/test/CodeGen/NVPTX/f32-lg2.ll | 17 +
llvm/test/CodeGen/NVPTX/fexp2.ll | 414 ++++++++++++++++++++
llvm/test/CodeGen/NVPTX/flog2.ll | 234 +++++++++++
11 files changed, 831 insertions(+), 13 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/bf16-ex2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/f32-ex2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/f32-lg2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/fexp2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/flog2.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index fd07d131ce15b2..3513586dccce44 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -964,6 +964,10 @@ let TargetPrefix = "nvvm" in {
DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
def int_nvvm_ex2_approx_f16x2 :
DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
+ def int_nvvm_ex2_approx_bf16 :
+ DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>;
+ def int_nvvm_ex2_approx_bf16x2 :
+ DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty], [IntrNoMem]>;
def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..ff26f5c66c71e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -94,6 +94,13 @@ static cl::opt<bool> UsePrecSqrtF32(
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
+/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
+/// does NOT use lg2.approx for log2, so this is disabled by default.
+static cl::opt<bool> UseApproxLog2F32(
+ "nvptx-approx-log2f32",
+ cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
+ cl::init(false));
+
static cl::opt<bool> ForceMinByValParamAlign(
"nvptx-force-min-byval-param-align", cl::Hidden,
cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
@@ -123,6 +130,8 @@ bool NVPTXTargetLowering::usePrecSqrtF32() const {
}
}
+bool NVPTXTargetLowering::useApproxLog2() const { return UseApproxLog2F32; }
+
bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
DenormalMode::PreserveSign;
@@ -520,6 +529,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
case ISD::FMINIMUM:
IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
break;
+ case ISD::FEXP2:
+ IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
+ break;
}
setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
};
@@ -968,7 +980,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+ // FEXP2 support:
+ // - f32
+ // - f16/f16x2 (sm_75+, PTX 7.0+)
+ // - bf16/bf16x2 (sm_90+, PTX 7.8+)
+ // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
+ setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
+ setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
+ setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
+
+ // FLOG2 supports f32 only
+ // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
+ if (UseApproxLog2F32) {
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+ setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
+ setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
+ setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand);
+ }
+
// No FPOW or FREM in PTX.
// Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 0244a0c5bec9d5..cb3bf1d96d60f5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -211,6 +211,9 @@ class NVPTXTargetLowering : public TargetLowering {
// sqrt instruction.
bool usePrecSqrtF32() const;
+ // Get whether to use lg2.approx for log2
+ bool useApproxLog2() const;
+
// Get whether we should use instructions that flush floating-point denormals
// to sign-preserving zero.
bool useF32FTZ(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 711cd67eceed9a..899d1691c4c7c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -145,6 +145,7 @@ def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
+def useApproxLog2 : Predicate<"useApproxLog2()">;
def doMulWide : Predicate<"doMulWide">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0773c1bbc57819..f9414aa9593bb3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1255,11 +1255,30 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;",
Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;",
Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
+def INT_NVVM_EX2_APPROX_BF16 : F_MATH_1<"ex2.approx.ftz.bf16 \t$dst, $src0;",
+ Int16Regs, Int16Regs, int_nvvm_ex2_approx_bf16, [hasPTX<78>, hasSM<90>]>;
+def INT_NVVM_EX2_APPROX_BF16X2 : F_MATH_1<"ex2.approx.ftz.bf16x2 \t$dst, $src0;",
+ Int32Regs, Int32Regs, int_nvvm_ex2_approx_bf16x2, [hasPTX<78>, hasSM<90>]>;
+
+def : Pat<(fexp2 f32:$a),
+ (INT_NVVM_EX2_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(fexp2 f32:$a),
+ (INT_NVVM_EX2_APPROX_F Float32Regs:$a)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fexp2 f16:$a),
+ (INT_NVVM_EX2_APPROX_F16 Int16Regs:$a)>, Requires<[useFP16Math]>;
+def : Pat<(fexp2 v2f16:$a),
+ (INT_NVVM_EX2_APPROX_F16X2 Int32Regs:$a)>, Requires<[useFP16Math]>;
+def : Pat<(fexp2 bf16:$a),
+ (INT_NVVM_EX2_APPROX_BF16 Int16Regs:$a)>, Requires<[hasBF16Math]>;
+def : Pat<(fexp2 v2bf16:$a),
+ (INT_NVVM_EX2_APPROX_BF16X2 Int32Regs:$a)>, Requires<[hasBF16Math]>;
+
def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
@@ -1267,6 +1286,11 @@ def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+def : Pat<(flog2 f32:$a),
+ (INT_NVVM_LG2_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(flog2 f32:$a),
+ (INT_NVVM_LG2_APPROX_F Float32Regs:$a)>, Requires<[doNoF32FTZ]>;
+
//
// Sin Cos
//
diff --git a/llvm/test/CodeGen/NVPTX/bf16-ex2.ll b/llvm/test/CodeGen/NVPTX/bf16-ex2.ll
new file mode 100644
index 00000000000000..ba57d30692c139
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bf16-ex2.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+target triple = "nvptx64-nvidia-cuda"
+
+declare bfloat @llvm.nvvm.ex2.approx.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.bf16x2(<2 x bfloat>)
+
+; CHECK-LABEL: ex2_bfloat
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-BF16-LABEL: ex2_bfloat(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0:
+; CHECK-BF16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-BF16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-BF16-NEXT: ret;
+ %res = call bfloat @llvm.nvvm.ex2.approx.bf16(bfloat %0)
+ ret bfloat %res
+}
+
+; CHECK-LABEL: ex2_2xbfloat
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+;
+; CHECK-BF16-LABEL: ex2_2xbfloat(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b32 %r<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0:
+; CHECK-BF16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-BF16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-BF16-NEXT: ret;
+ %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.bf16x2(<2 x bfloat> %0)
+ ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index df3a36db52b1a0..ae70946b4b1dc9 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,21 +1,37 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+target triple = "nvptx64-nvidia-cuda"
declare half @llvm.nvvm.ex2.approx.f16(half)
declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
-; CHECK-LABEL: exp2_half
-define half @exp2_half(half %0) {
- ; CHECK-NOT: call
- ; CHECK: ex2.approx.f16
- %res = call half @llvm.nvvm.ex2.approx.f16(half %0);
+; CHECK-LABEL: ex2_half
+define half @ex2_half(half %0) {
+; CHECK-FP16-LABEL: ex2_half(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_half_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+ %res = call half @llvm.nvvm.ex2.approx.f16(half %0)
ret half %res
}
-; CHECK-LABEL: exp2_2xhalf
-define <2 x half> @exp2_2xhalf(<2 x half> %0) {
- ; CHECK-NOT: call
- ; CHECK: ex2.approx.f16x2
- %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0);
+; CHECK-LABEL: ex2_2xhalf
+define <2 x half> @ex2_2xhalf(<2 x half> %0) {
+; CHECK-FP16-LABEL: ex2_2xhalf(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xhalf_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+ %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
ret <2 x half> %res
}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
new file mode 100644
index 00000000000000..c9eff2a8ff17dc
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+target triple = "nvptx-nvidia-cuda"
+
+declare float @llvm.nvvm.ex2.approx.f(float)
+
+; CHECK-LABEL: ex2_float
+define float @ex2_float(float %0) {
+; CHECK-LABEL: ex2_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+ ret float %res
+}
+
+; CHECK-LABEL: ex2_float_ftz
+define float @ex2_float_ftz(float %0) {
+; CHECK-LABEL: ex2_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_ftz_param_0];
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+ ret float %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
new file mode 100644
index 00000000000000..7d6c3f879d1ce3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+target triple = "nvptx-nvidia-cuda"
+
+declare float @llvm.nvvm.lg2.approx.f32(float)
+declare <2 x float> @llvm.nvvm.lg2.approx.v2f32(<2 x float>)
+
+; CHECK-LABEL: log2_float
+define float @log2_float(float %0) {
+ %res = call float @llvm.nvvm.lg2.approx.f32(float %0)
+ ret float %res
+}
+
+; CHECK-LABEL: log2_2xfloat
+define <2 x float> @log2_2xfloat(<2 x float> %0) {
+ %res = call <2 x float> @llvm.nvvm.lg2.approx.v2f32(<2 x float> %0)
+ ret <2 x float> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
new file mode 100644
index 00000000000000..7e485dca65764c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; --- f32 ---
+
+; CHECK-LABEL: exp2_test
+define float @exp2_test(float %in) {
+; CHECK-LABEL: exp2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_test(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .f32 %f<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_test(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .f32 %f<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-BF16-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_ftz_test
+define float @exp2_ftz_test(float %in) #0 {
+; CHECK-LABEL: exp2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_ftz_test(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .f32 %f<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_ftz_test(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .f32 %f<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-BF16-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_test_v
+define <2 x float> @exp2_test_v(<2 x float> %in) {
+; CHECK-LABEL: exp2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f3, %f2;
+; CHECK-NEXT: ex2.approx.f32 %f4, %f1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_test_v(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .f32 %f<5>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f32 %f3, %f2;
+; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f1;
+; CHECK-FP16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_test_v(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .f32 %f<5>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-BF16-NEXT: ex2.approx.f32 %f3, %f2;
+; CHECK-BF16-NEXT: ex2.approx.f32 %f4, %f1;
+; CHECK-BF16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
+ ret <2 x float> %exp2
+}
+
+; --- f16 ---
+
+; CHECK-LABEL: exp2_f16_test
+define half @exp2_f16_test(half %in) {
+; CHECK-LABEL: exp2_f16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0];
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_f16_test(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_f16_test(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0];
+; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1;
+; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call half @llvm.exp2.f16(half %in)
+ ret half %exp2
+}
+
+; COM: we should never have .ftz for f16
+; CHECK-LABEL: exp2_f16_ftz_test
+define half @exp2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: exp2_f16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
+; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_f16_ftz_test(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_f16_ftz_test(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
+; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1;
+; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call half @llvm.exp2.f16(half %in)
+ ret half %exp2
+}
+
+; CHECK-LABEL: exp2_f16_test_v
+define <2 x half> @exp2_f16_test_v(<2 x half> %in) {
+; CHECK-LABEL: exp2_f16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT: cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT: ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_f16_test_v(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0];
+; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_f16_test_v(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b32 %r<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0];
+; CHECK-BF16-NEXT: ex2.approx.f16x2 %r2, %r1;
+; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> %in)
+ ret <2 x half> %exp2
+}
+
+; --- bf16 ---
+
+; COM: we should always have .ftz for bf16
+; CHECK-LABEL: exp2_bf16_test
+define bfloat @exp2_bf16_test(bfloat %in) {
+; CHECK-LABEL: exp2_bf16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_bf16_test(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .pred %p<2>;
+; CHECK-FP16-NEXT: .reg .b16 %rs<2>;
+; CHECK-FP16-NEXT: .reg .b32 %r<9>;
+; CHECK-FP16-NEXT: .reg .f32 %f<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FP16-NEXT: mov.b32 %f1, %r2;
+; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-FP16-NEXT: mov.b32 %r3, %f2;
+; CHECK-FP16-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-FP16-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-FP16-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-FP16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_bf16_test(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_bf16_test_param_0];
+; CHECK-BF16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in)
+ ret bfloat %exp2
+}
+
+; CHECK-LABEL: exp2_bf16_test_v
+define <2 x bfloat> @exp2_bf16_test_v(<2 x bfloat> %in) {
+; CHECK-LABEL: exp2_bf16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-NEXT: mov.b32 %f1, %r3;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r4, %f2;
+; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-NEXT: mov.b32 %f3, %r11;
+; CHECK-NEXT: ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT: mov.b32 %r12, %f4;
+; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r18;
+; CHECK-NEXT: ret;
+;
+; CHECK-FP16-LABEL: exp2_bf16_test_v(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .pred %p<3>;
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-NEXT: .reg .b32 %r<19>;
+; CHECK-FP16-NEXT: .reg .f32 %f<5>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0: // %entry
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
+; CHECK-FP16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-FP16-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-FP16-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-FP16-NEXT: mov.b32 %f1, %r3;
+; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-FP16-NEXT: mov.b32 %r4, %f2;
+; CHECK-FP16-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-FP16-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-FP16-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-FP16-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-FP16-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-FP16-NEXT: mov.b32 %f3, %r11;
+; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f3;
+; CHECK-FP16-NEXT: mov.b32 %r12, %f4;
+; CHECK-FP16-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-FP16-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p2, %f4, %f4;
+; CHECK-FP16-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-FP16-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r18;
+; CHECK-FP16-NEXT: ret;
+;
+; CHECK-BF16-LABEL: exp2_bf16_test_v(
+; CHECK-BF16: {
+; CHECK-BF16-NEXT: .reg .b32 %r<3>;
+; CHECK-BF16-EMPTY:
+; CHECK-BF16-NEXT: // %bb.0: // %entry
+; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
+; CHECK-BF16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-BF16-NEXT: ret;
+entry:
+ %exp2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in)
+ ret <2 x bfloat> %exp2
+}
+
+declare float @llvm.exp2.f32(float %val)
+
+declare <2 x float> @llvm.exp2.v2f32(<2 x float> %val)
+
+declare half @llvm.exp2.f16(half %val)
+
+declare <2 x half> @llvm.exp2.v2f16(<2 x half> %val)
+
+declare bfloat @llvm.exp2.bf16(bfloat %val)
+
+declare <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
new file mode 100644
index 00000000000000..ff762dcf74b2f9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: log2_test
+define float @log2_test(float %in) {
+; CHECK-LABEL: log2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_ftz_test
+define float @log2_ftz_test(float %in) #0 {
+; CHECK-LABEL: log2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_test_v
+define <2 x float> @log2_test_v(<2 x float> %in) {
+; CHECK-LABEL: log2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f3, %f2;
+; CHECK-NEXT: lg2.approx.f32 %f4, %f1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
+ ret <2 x float> %log2
+}
+
+; --- f16 (emulated by promoting to f32) ---
+
+; CHECK-LABEL: log2_f16_test
+define half @log2_f16_test(half %in) {
+; CHECK-LABEL: log2_f16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_test_param_0];
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call half @llvm.log2.f16(half %in)
+ ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_ftz_test
+define half @log2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: log2_f16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_ftz_test_param_0];
+; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call half @llvm.log2.f16(half %in)
+ ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_test_v
+define <2 x half> @log2_f16_test_v(<2 x half> %in) {
+; CHECK-LABEL: log2_f16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [log2_f16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT: cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT: lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <2 x half> @llvm.log2.v2f16(<2 x half> %in)
+ ret <2 x half> %log2
+}
+
+; --- bf16 (emulated by promoting to f32) ---
+
+; CHECK-LABEL: log2_bf16_test
+define bfloat @log2_bf16_test(bfloat %in) {
+; CHECK-LABEL: log2_bf16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_ftz_test
+define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
+; CHECK-LABEL: log2_bf16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.ftz.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_test_v
+define <2 x bfloat> @log2_bf16_test_v(<2 x bfloat> %in) {
+; CHECK-LABEL: log2_bf16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [log2_bf16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-NEXT: mov.b32 %f1, %r3;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r4, %f2;
+; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-NEXT: mov.b32 %f3, %r11;
+; CHECK-NEXT: lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT: mov.b32 %r12, %f4;
+; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r18;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
+ ret <2 x bfloat> %log2
+}
+
+declare float @llvm.log2.f32(float %val)
+
+declare <2 x float> @llvm.log2.v2f32(<2 x float> %val)
+
+declare half @llvm.log2.f16(half %val)
+
+declare <2 x half> @llvm.log2.v2f16(<2 x half> %val)
+
+declare bfloat @llvm.log2.bf16(bfloat %val)
+
+declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
More information about the llvm-commits
mailing list