[llvm] [NVPTX] Support llvm.{exp2, log2} for f32/f16/bf16 and vectors (PR #120519)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 21:18:19 PST 2024
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/120519
>From a246f128da1192eedbe11d7c7fb2ad9ed9abd6b7 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 18 Dec 2024 23:37:41 -0500
Subject: [PATCH] [NVPTX] Support llvm.{exp2,log2} for f32/f16/bf16 and vectors
Lower llvm.exp2 to ex2.approx and llvm.log2 to lg2.approx for f32 and
all vectors of f32. Also support f16 and bf16 variants by promoting
values to f32.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 28 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 ++
llvm/test/CodeGen/NVPTX/fexp2.ll | 270 ++++++++++++++++++++
llvm/test/CodeGen/NVPTX/flog2.ll | 268 +++++++++++++++++++
4 files changed, 582 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/fexp2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/flog2.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..bc4fc4d3ca0f42 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -94,6 +94,18 @@ static cl::opt<bool> UsePrecSqrtF32(
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
+// Switches for lowering llvm.exp2 / llvm.log2 to the approximate PTX
+// instructions. Both default to on and can be disabled independently. Both are
+// cl::Hidden, like nvptx-force-min-byval-param-align below.
+static cl::opt<bool>
+    UseApproxExp2F32("nvptx-approx-exp2f32", cl::Hidden,
+                     cl::desc("NVPTX Specific: whether to use ex2.approx for "
+                              "exp2f() (default = true)"),
+                     cl::init(true));
+
+static cl::opt<bool>
+    UseApproxLog2F32("nvptx-approx-log2f32", cl::Hidden,
+                     cl::desc("NVPTX Specific: whether to use lg2.approx for "
+                              "log2f() (default = true)"),
+                     cl::init(true));
+
static cl::opt<bool> ForceMinByValParamAlign(
"nvptx-force-min-byval-param-align", cl::Hidden,
cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
@@ -968,7 +980,21 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+  // Lowering of exp2/log2 to the approximate PTX instructions, individually
+  // switchable. For an enabled opcode: f32 is Legal, scalar f16/bf16 are
+  // promoted to f32, and the two-element half/bfloat vectors are Expanded.
+  const auto AllowApproxF32 = [this](unsigned Opcode) {
+    setOperationAction(Opcode, MVT::f32, Legal);
+    for (MVT ScalarVT : {MVT::f16, MVT::bf16})
+      setOperationPromotedToType(Opcode, ScalarVT, MVT::f32);
+    for (MVT PackedVT : {MVT::v2f16, MVT::v2bf16})
+      setOperationAction(Opcode, PackedVT, Expand);
+  };
+  if (UseApproxExp2F32)
+    AllowApproxF32(ISD::FEXP2);
+  if (UseApproxLog2F32)
+    AllowApproxF32(ISD::FLOG2);
+
// No FPOW or FREM in PTX.
// Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 711cd67eceed9a..ca61337b204a4a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -518,6 +518,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
Requires<[hasBF16Math, noFMA]>;
}
+// Template for operations which take one f32 operand and produce an f32.
+// Provides two instructions: <OpcStr>.ftz.f32 (flush subnormal inputs and
+// results to zero), which requires doF32FTZ, and the plain <OpcStr>.f32.
+multiclass F1<string OpcStr, SDNode OpNode> {
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".ftz.f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".f32 \t$dst, $a;",
+ [(set f32:$dst, (OpNode f32:$a))]>;
+}
+
// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
@@ -1193,6 +1206,10 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
+defm FEXP2 : F1<"ex2.approx", fexp2>; // f32 fexp2 -> ex2.approx[.ftz].f32
+
+defm FLOG2 : F1<"lg2.approx", flog2>; // f32 flog2 -> lg2.approx[.ftz].f32
+
//
// F16 NEG
//
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
new file mode 100644
index 00000000000000..fbacc35be70f1a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; The ptxas check is gated on ptxas >= 12.0, which no longer accepts sm_20, so
+; verification targets sm_50; the FileCheck run above is unchanged.
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; --- f32 ---
+
+; CHECK-LABEL: exp2_test
+define float @exp2_test(float %in) {
+; CHECK-LABEL: exp2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_ftz_test
+define float @exp2_ftz_test(float %in) #0 {
+; CHECK-LABEL: exp2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call float @llvm.exp2.f32(float %in)
+ ret float %exp2
+}
+
+; CHECK-LABEL: exp2_test_v
+define <4 x float> @exp2_test_v(<4 x float> %in) {
+; CHECK-LABEL: exp2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [exp2_test_v_param_0];
+; CHECK-NEXT: ex2.approx.f32 %f5, %f4;
+; CHECK-NEXT: ex2.approx.f32 %f6, %f3;
+; CHECK-NEXT: ex2.approx.f32 %f7, %f2;
+; CHECK-NEXT: ex2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+ ret <4 x float> %exp2
+}
+
+; --- f16 ---
+
+; CHECK-LABEL: exp2_f16_test
+define half @exp2_f16_test(half %in) {
+; CHECK-LABEL: exp2_f16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0];
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call half @llvm.exp2.f16(half %in)
+ ret half %exp2
+}
+
+; CHECK-LABEL: exp2_f16_ftz_test
+define half @exp2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: exp2_f16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
+; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call half @llvm.exp2.f16(half %in)
+ ret half %exp2
+}
+
+; CHECK-LABEL: exp2_f16_test_v
+define <4 x half> @exp2_f16_test_v(<4 x half> %in) {
+; CHECK-LABEL: exp2_f16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [exp2_f16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT: cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT: ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: cvt.f32.f16 %f5, %rs6;
+; CHECK-NEXT: ex2.approx.f32 %f6, %f5;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %f6;
+; CHECK-NEXT: cvt.f32.f16 %f7, %rs5;
+; CHECK-NEXT: ex2.approx.f32 %f8, %f7;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NEXT: mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call <4 x half> @llvm.exp2.v4f16(<4 x half> %in)
+ ret <4 x half> %exp2
+}
+
+; --- bf16 ---
+
+; CHECK-LABEL: exp2_bf16_test
+define bfloat @exp2_bf16_test(bfloat %in) {
+; CHECK-LABEL: exp2_bf16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in)
+ ret bfloat %exp2
+}
+
+; CHECK-LABEL: exp2_bf16_ftz_test
+define bfloat @exp2_bf16_ftz_test(bfloat %in) #0 {
+; CHECK-LABEL: exp2_bf16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_ftz_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.ftz.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in)
+ ret bfloat %exp2
+}
+
+; CHECK-LABEL: exp2_bf16_test_v
+define <4 x bfloat> @exp2_bf16_test_v(<4 x bfloat> %in) {
+; CHECK-LABEL: exp2_bf16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<37>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [exp2_bf16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-NEXT: mov.b32 %f1, %r4;
+; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r5, %f2;
+; CHECK-NEXT: bfe.u32 %r6, %r5, 16, 1;
+; CHECK-NEXT: add.s32 %r7, %r6, %r5;
+; CHECK-NEXT: add.s32 %r8, %r7, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r9, %r5, 4194304;
+; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs1;
+; CHECK-NEXT: shl.b32 %r12, %r11, 16;
+; CHECK-NEXT: mov.b32 %f3, %r12;
+; CHECK-NEXT: ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT: mov.b32 %r13, %f4;
+; CHECK-NEXT: bfe.u32 %r14, %r13, 16, 1;
+; CHECK-NEXT: add.s32 %r15, %r14, %r13;
+; CHECK-NEXT: add.s32 %r16, %r15, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT: or.b32 %r17, %r13, 4194304;
+; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p2;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r10, 0x7632U;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs4;
+; CHECK-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-NEXT: mov.b32 %f5, %r21;
+; CHECK-NEXT: ex2.approx.f32 %f6, %f5;
+; CHECK-NEXT: mov.b32 %r22, %f6;
+; CHECK-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-NEXT: setp.nan.f32 %p3, %f6, %f6;
+; CHECK-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-NEXT: selp.b32 %r27, %r26, %r25, %p3;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs3;
+; CHECK-NEXT: shl.b32 %r29, %r28, 16;
+; CHECK-NEXT: mov.b32 %f7, %r29;
+; CHECK-NEXT: ex2.approx.f32 %f8, %f7;
+; CHECK-NEXT: mov.b32 %r30, %f8;
+; CHECK-NEXT: bfe.u32 %r31, %r30, 16, 1;
+; CHECK-NEXT: add.s32 %r32, %r31, %r30;
+; CHECK-NEXT: add.s32 %r33, %r32, 32767;
+; CHECK-NEXT: setp.nan.f32 %p4, %f8, %f8;
+; CHECK-NEXT: or.b32 %r34, %r30, 4194304;
+; CHECK-NEXT: selp.b32 %r35, %r34, %r33, %p4;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r27, 0x7632U;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r36, %r19};
+; CHECK-NEXT: ret;
+entry:
+ %exp2 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> %in)
+ ret <4 x bfloat> %exp2
+}
+
+declare float @llvm.exp2.f32(float %val)
+
+declare <4 x float> @llvm.exp2.v4f32(<4 x float> %val)
+
+declare half @llvm.exp2.f16(half %val)
+
+declare <4 x half> @llvm.exp2.v4f16(<4 x half> %val)
+
+declare bfloat @llvm.exp2.bf16(bfloat %val)
+
+declare <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
new file mode 100644
index 00000000000000..7316f1ea0a3488
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; The ptxas check is gated on ptxas >= 12.0, which no longer accepts sm_20, so
+; verification targets sm_50; the FileCheck run above is unchanged.
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: log2_test
+define float @log2_test(float %in) {
+; CHECK-LABEL: log2_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_ftz_test
+define float @log2_ftz_test(float %in) #0 {
+; CHECK-LABEL: log2_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call float @llvm.log2.f32(float %in)
+ ret float %log2
+}
+
+; CHECK-LABEL: log2_test_v
+define <4 x float> @log2_test_v(<4 x float> %in) {
+; CHECK-LABEL: log2_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [log2_test_v_param_0];
+; CHECK-NEXT: lg2.approx.f32 %f5, %f4;
+; CHECK-NEXT: lg2.approx.f32 %f6, %f3;
+; CHECK-NEXT: lg2.approx.f32 %f7, %f2;
+; CHECK-NEXT: lg2.approx.f32 %f8, %f1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+ ret <4 x float> %log2
+}
+
+; --- f16 ---
+
+; CHECK-LABEL: log2_f16_test
+define half @log2_f16_test(half %in) {
+; CHECK-LABEL: log2_f16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_test_param_0];
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call half @llvm.log2.f16(half %in)
+ ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_ftz_test
+define half @log2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: log2_f16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_ftz_test_param_0];
+; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call half @llvm.log2.f16(half %in)
+ ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_test_v
+define <4 x half> @log2_f16_test_v(<4 x half> %in) {
+; CHECK-LABEL: log2_f16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [log2_f16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT: cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT: lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: cvt.f32.f16 %f5, %rs6;
+; CHECK-NEXT: lg2.approx.f32 %f6, %f5;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %f6;
+; CHECK-NEXT: cvt.f32.f16 %f7, %rs5;
+; CHECK-NEXT: lg2.approx.f32 %f8, %f7;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NEXT: mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <4 x half> @llvm.log2.v4f16(<4 x half> %in)
+ ret <4 x half> %log2
+}
+
+; --- bf16 ---
+
+; CHECK-LABEL: log2_bf16_test
+define bfloat @log2_bf16_test(bfloat %in) {
+; CHECK-LABEL: log2_bf16_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_ftz_test
+define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
+; CHECK-LABEL: log2_bf16_ftz_test(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.ftz.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_test_v
+define <4 x bfloat> @log2_bf16_test_v(<4 x bfloat> %in) {
+; CHECK-LABEL: log2_bf16_test_v(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<37>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [log2_bf16_test_v_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-NEXT: mov.b32 %f1, %r4;
+; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT: mov.b32 %r5, %f2;
+; CHECK-NEXT: bfe.u32 %r6, %r5, 16, 1;
+; CHECK-NEXT: add.s32 %r7, %r6, %r5;
+; CHECK-NEXT: add.s32 %r8, %r7, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT: or.b32 %r9, %r5, 4194304;
+; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs1;
+; CHECK-NEXT: shl.b32 %r12, %r11, 16;
+; CHECK-NEXT: mov.b32 %f3, %r12;
+; CHECK-NEXT: lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT: mov.b32 %r13, %f4;
+; CHECK-NEXT: bfe.u32 %r14, %r13, 16, 1;
+; CHECK-NEXT: add.s32 %r15, %r14, %r13;
+; CHECK-NEXT: add.s32 %r16, %r15, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT: or.b32 %r17, %r13, 4194304;
+; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p2;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r10, 0x7632U;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs4;
+; CHECK-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-NEXT: mov.b32 %f5, %r21;
+; CHECK-NEXT: lg2.approx.f32 %f6, %f5;
+; CHECK-NEXT: mov.b32 %r22, %f6;
+; CHECK-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-NEXT: setp.nan.f32 %p3, %f6, %f6;
+; CHECK-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-NEXT: selp.b32 %r27, %r26, %r25, %p3;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs3;
+; CHECK-NEXT: shl.b32 %r29, %r28, 16;
+; CHECK-NEXT: mov.b32 %f7, %r29;
+; CHECK-NEXT: lg2.approx.f32 %f8, %f7;
+; CHECK-NEXT: mov.b32 %r30, %f8;
+; CHECK-NEXT: bfe.u32 %r31, %r30, 16, 1;
+; CHECK-NEXT: add.s32 %r32, %r31, %r30;
+; CHECK-NEXT: add.s32 %r33, %r32, 32767;
+; CHECK-NEXT: setp.nan.f32 %p4, %f8, %f8;
+; CHECK-NEXT: or.b32 %r34, %r30, 4194304;
+; CHECK-NEXT: selp.b32 %r35, %r34, %r33, %p4;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r27, 0x7632U;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r36, %r19};
+; CHECK-NEXT: ret;
+entry:
+ %log2 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> %in)
+ ret <4 x bfloat> %log2
+}
+
+declare float @llvm.log2.f32(float %val)
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+declare half @llvm.log2.f16(half %val)
+
+declare <4 x half> @llvm.log2.v4f16(<4 x half> %val)
+
+declare bfloat @llvm.log2.bf16(bfloat %val)
+
+declare <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
More information about the llvm-commits
mailing list