[llvm] [NVPTX] Support exp2 and log2 for f32/f16/bf16 and vectors (PR #120519)

Princeton Ferro via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 20 13:58:37 PST 2024


https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/120519

>From accff238eca8c6b952e045d28818ef80fc0a8b52 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 18 Dec 2024 23:37:41 -0500
Subject: [PATCH] [NVPTX] Support exp2 and log2 for f32/f16/bf16 and vectors

Lower exp2 to ex2.approx and log2 to lg2.approx for f32 and
all vectors of f32. Also support f16 and bf16 variants by promoting
values to f32.

Lowering log2 to lg2.approx is off by default because its precision differs
from that of libdevice's implementation.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  29 ++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |  17 ++
 llvm/test/CodeGen/NVPTX/fexp2.ll            | 270 +++++++++++++++++++
 llvm/test/CodeGen/NVPTX/flog2.ll            | 271 ++++++++++++++++++++
 4 files changed, 586 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/fexp2.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/flog2.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..957c9a3bf6c312 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -94,6 +94,19 @@ static cl::opt<bool> UsePrecSqrtF32(
     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
     cl::init(true));
 
+/// Approximate exp2 is enabled by default to match existing CUDA behavior,
+/// which always uses the `ex2.approx` instruction. This is not the case for
+/// `log2`, so its approximate lowering is disabled by default.
+static cl::opt<bool> UseApproxExp2F32(
+    "nvptx-approx-exp2f32", cl::Hidden,
+    cl::desc("NVPTX Specific: whether to use ex2.approx for exp2"),
+    cl::init(true));
+
+static cl::opt<bool> UseApproxLog2F32(
+    "nvptx-approx-log2f32", cl::Hidden,
+    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
+    cl::init(false)); // Off by default: precision differs from libdevice.
+
 static cl::opt<bool> ForceMinByValParamAlign(
     "nvptx-force-min-byval-param-align", cl::Hidden,
     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
@@ -968,7 +981,21 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
   setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
 
-  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
+  if (UseApproxExp2F32) {
+    setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+    setOperationPromotedToType(ISD::FEXP2, MVT::f16, MVT::f32);
+    setOperationAction(ISD::FEXP2, MVT::v2f16, Expand);
+    setOperationPromotedToType(ISD::FEXP2, MVT::bf16, MVT::f32);
+    setOperationAction(ISD::FEXP2, MVT::v2bf16, Expand);
+  }
+  if (UseApproxLog2F32) {
+    setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+    setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
+    setOperationAction(ISD::FLOG2, MVT::v2f16, Expand);
+    setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
+    setOperationAction(ISD::FLOG2, MVT::v2bf16, Expand);
+  }
+
   // No FPOW or FREM in PTX.
 
   // Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 711cd67eceed9a..ca61337b204a4a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -518,6 +518,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
                Requires<[hasBF16Math, noFMA]>;
 }
 
+// Template for operations which take one f32 operand.  Provides two
+// instructions: <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush subnormal inputs and
+// results to zero), the latter requiring the doF32FTZ predicate.
+multiclass F1<string OpcStr, SDNode OpNode> {
+   def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                           OpcStr # ".ftz.f32 \t$dst, $a;",
+                           [(set f32:$dst, (OpNode f32:$a))]>,
+                           Requires<[doF32FTZ]>;
+   def f32 :     NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                           OpcStr # ".f32 \t$dst, $a;",
+                           [(set f32:$dst, (OpNode f32:$a))]>;
+}
+
 // Template for operations which take two f32 or f64 operands.  Provides three
 // instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
 // subnormal inputs and results to zero).
@@ -1193,6 +1206,10 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
 
 defm FSQRT : F2<"sqrt.rn", fsqrt>;
 
+defm FEXP2 : F1<"ex2.approx", fexp2>;
+
+defm FLOG2 : F1<"lg2.approx", flog2>;
+
 //
 // F16 NEG
 //
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
new file mode 100644
index 00000000000000..fbacc35be70f1a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+
+; --- f32 ---
+
+; CHECK-LABEL: exp2_test
+define float @exp2_test(float %in) {
+; CHECK-LABEL: exp2_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call float @llvm.exp2.f32(float %in)
+  ret float %exp2
+}
+
+; CHECK-LABEL: exp2_ftz_test
+define float @exp2_ftz_test(float %in) #0 {
+; CHECK-LABEL: exp2_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call float @llvm.exp2.f32(float %in)
+  ret float %exp2
+}
+
+; CHECK-LABEL: exp2_test_v
+define <4 x float> @exp2_test_v(<4 x float> %in) {
+; CHECK-LABEL: exp2_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [exp2_test_v_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %f5, %f4;
+; CHECK-NEXT:    ex2.approx.f32 %f6, %f3;
+; CHECK-NEXT:    ex2.approx.f32 %f7, %f2;
+; CHECK-NEXT:    ex2.approx.f32 %f8, %f1;
+; CHECK-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+  ret <4 x float> %exp2
+}
+
+; --- f16 ---
+
+; CHECK-LABEL: exp2_f16_test
+define half @exp2_f16_test(half %in) {
+; CHECK-LABEL: exp2_f16_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b16 %rs1, [exp2_f16_test_param_0];
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call half @llvm.exp2.f16(half %in)
+  ret half %exp2
+}
+
+; CHECK-LABEL: exp2_f16_ftz_test
+define half @exp2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: exp2_f16_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0];
+; CHECK-NEXT:    cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call half @llvm.exp2.f16(half %in)
+  ret half %exp2
+}
+
+; CHECK-LABEL: exp2_f16_test_v
+define <4 x half> @exp2_f16_test_v(<4 x half> %in) {
+; CHECK-LABEL: exp2_f16_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [exp2_f16_test_v_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT:    ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs6;
+; CHECK-NEXT:    ex2.approx.f32 %f6, %f5;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
+; CHECK-NEXT:    cvt.f32.f16 %f7, %rs5;
+; CHECK-NEXT:    ex2.approx.f32 %f8, %f7;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call <4 x half> @llvm.exp2.v4f16(<4 x half> %in)
+  ret <4 x half> %exp2
+}
+
+; --- bf16 ---
+
+; CHECK-LABEL: exp2_bf16_test
+define bfloat @exp2_bf16_test(bfloat %in) {
+; CHECK-LABEL: exp2_bf16_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r3, %f2;
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
+; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in)
+  ret bfloat %exp2
+}
+
+; CHECK-LABEL: exp2_bf16_ftz_test
+define bfloat @exp2_bf16_ftz_test(bfloat %in) #0 {
+; CHECK-LABEL: exp2_bf16_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.u16 %r1, [exp2_bf16_ftz_test_param_0];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r3, %f2;
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
+; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
+; CHECK-NEXT:    setp.nan.ftz.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in)
+  ret bfloat %exp2
+}
+
+; CHECK-LABEL: exp2_bf16_test_v
+define <4 x bfloat> @exp2_bf16_test_v(<4 x bfloat> %in) {
+; CHECK-LABEL: exp2_bf16_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<37>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [exp2_bf16_test_v_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r4;
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r5, %f2;
+; CHECK-NEXT:    bfe.u32 %r6, %r5, 16, 1;
+; CHECK-NEXT:    add.s32 %r7, %r6, %r5;
+; CHECK-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r9, %r5, 4194304;
+; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs1;
+; CHECK-NEXT:    shl.b32 %r12, %r11, 16;
+; CHECK-NEXT:    mov.b32 %f3, %r12;
+; CHECK-NEXT:    ex2.approx.f32 %f4, %f3;
+; CHECK-NEXT:    mov.b32 %r13, %f4;
+; CHECK-NEXT:    bfe.u32 %r14, %r13, 16, 1;
+; CHECK-NEXT:    add.s32 %r15, %r14, %r13;
+; CHECK-NEXT:    add.s32 %r16, %r15, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT:    or.b32 %r17, %r13, 4194304;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p2;
+; CHECK-NEXT:    prmt.b32 %r19, %r18, %r10, 0x7632U;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs4;
+; CHECK-NEXT:    shl.b32 %r21, %r20, 16;
+; CHECK-NEXT:    mov.b32 %f5, %r21;
+; CHECK-NEXT:    ex2.approx.f32 %f6, %f5;
+; CHECK-NEXT:    mov.b32 %r22, %f6;
+; CHECK-NEXT:    bfe.u32 %r23, %r22, 16, 1;
+; CHECK-NEXT:    add.s32 %r24, %r23, %r22;
+; CHECK-NEXT:    add.s32 %r25, %r24, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p3, %f6, %f6;
+; CHECK-NEXT:    or.b32 %r26, %r22, 4194304;
+; CHECK-NEXT:    selp.b32 %r27, %r26, %r25, %p3;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs3;
+; CHECK-NEXT:    shl.b32 %r29, %r28, 16;
+; CHECK-NEXT:    mov.b32 %f7, %r29;
+; CHECK-NEXT:    ex2.approx.f32 %f8, %f7;
+; CHECK-NEXT:    mov.b32 %r30, %f8;
+; CHECK-NEXT:    bfe.u32 %r31, %r30, 16, 1;
+; CHECK-NEXT:    add.s32 %r32, %r31, %r30;
+; CHECK-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p4, %f8, %f8;
+; CHECK-NEXT:    or.b32 %r34, %r30, 4194304;
+; CHECK-NEXT:    selp.b32 %r35, %r34, %r33, %p4;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r27, 0x7632U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r36, %r19};
+; CHECK-NEXT:    ret;
+entry:
+  %exp2 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> %in)
+  ret <4 x bfloat> %exp2
+}
+
+declare float @llvm.exp2.f32(float %val)
+
+declare <4 x float> @llvm.exp2.v4f32(<4 x float> %val)
+
+declare half @llvm.exp2.f16(half %val)
+
+declare <4 x half> @llvm.exp2.v4f16(<4 x half> %val)
+
+declare bfloat @llvm.exp2.bf16(bfloat %val)
+
+declare <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
new file mode 100644
index 00000000000000..822c7e610358ef
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_20 %}
+target triple = "nvptx64-nvidia-cuda"
+; Without -nvptx-approx-log2f32 there is no lowering for flog2, so llc must fail.
+; RUN: not --crash llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx32 2>&1 | FileCheck --check-prefixes=ERR %s
+; ERR: no libcall available for flog2
+
+; CHECK-LABEL: log2_test
+define float @log2_test(float %in) {
+; CHECK-LABEL: log2_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call float @llvm.log2.f32(float %in)
+  ret float %log2
+}
+
+; CHECK-LABEL: log2_ftz_test
+define float @log2_ftz_test(float %in) #0 {
+; CHECK-LABEL: log2_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call float @llvm.log2.f32(float %in)
+  ret float %log2
+}
+
+; CHECK-LABEL: log2_test_v
+define <4 x float> @log2_test_v(<4 x float> %in) {
+; CHECK-LABEL: log2_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [log2_test_v_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %f5, %f4;
+; CHECK-NEXT:    lg2.approx.f32 %f6, %f3;
+; CHECK-NEXT:    lg2.approx.f32 %f7, %f2;
+; CHECK-NEXT:    lg2.approx.f32 %f8, %f1;
+; CHECK-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+  ret <4 x float> %log2
+}
+
+; --- f16 ---
+
+; CHECK-LABEL: log2_f16_test
+define half @log2_f16_test(half %in) {
+; CHECK-LABEL: log2_f16_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b16 %rs1, [log2_f16_test_param_0];
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call half @llvm.log2.f16(half %in)
+  ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_ftz_test
+define half @log2_f16_ftz_test(half %in) #0 {
+; CHECK-LABEL: log2_f16_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b16 %rs1, [log2_f16_ftz_test_param_0];
+; CHECK-NEXT:    cvt.ftz.f32.f16 %f1, %rs1;
+; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call half @llvm.log2.f16(half %in)
+  ret half %log2
+}
+
+; CHECK-LABEL: log2_f16_test_v
+define <4 x half> @log2_f16_test_v(<4 x half> %in) {
+; CHECK-LABEL: log2_f16_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [log2_f16_test_v_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT:    lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs6;
+; CHECK-NEXT:    lg2.approx.f32 %f6, %f5;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
+; CHECK-NEXT:    cvt.f32.f16 %f7, %rs5;
+; CHECK-NEXT:    lg2.approx.f32 %f8, %f7;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call <4 x half> @llvm.log2.v4f16(<4 x half> %in)
+  ret <4 x half> %log2
+}
+
+; --- bf16 ---
+
+; CHECK-LABEL: log2_bf16_test
+define bfloat @log2_bf16_test(bfloat %in) {
+; CHECK-LABEL: log2_bf16_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.u16 %r1, [log2_bf16_test_param_0];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r3, %f2;
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
+; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+  ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_ftz_test
+define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
+; CHECK-LABEL: log2_bf16_ftz_test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.u16 %r1, [log2_bf16_ftz_test_param_0];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r3, %f2;
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT:    add.s32 %r5, %r4, %r3;
+; CHECK-NEXT:    add.s32 %r6, %r5, 32767;
+; CHECK-NEXT:    setp.nan.ftz.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call bfloat @llvm.log2.bf16(bfloat %in)
+  ret bfloat %log2
+}
+
+; CHECK-LABEL: log2_bf16_test_v
+define <4 x bfloat> @log2_bf16_test_v(<4 x bfloat> %in) {
+; CHECK-LABEL: log2_bf16_test_v(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<37>;
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [log2_bf16_test_v_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r4;
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    mov.b32 %r5, %f2;
+; CHECK-NEXT:    bfe.u32 %r6, %r5, 16, 1;
+; CHECK-NEXT:    add.s32 %r7, %r6, %r5;
+; CHECK-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p1, %f2, %f2;
+; CHECK-NEXT:    or.b32 %r9, %r5, 4194304;
+; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs1;
+; CHECK-NEXT:    shl.b32 %r12, %r11, 16;
+; CHECK-NEXT:    mov.b32 %f3, %r12;
+; CHECK-NEXT:    lg2.approx.f32 %f4, %f3;
+; CHECK-NEXT:    mov.b32 %r13, %f4;
+; CHECK-NEXT:    bfe.u32 %r14, %r13, 16, 1;
+; CHECK-NEXT:    add.s32 %r15, %r14, %r13;
+; CHECK-NEXT:    add.s32 %r16, %r15, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p2, %f4, %f4;
+; CHECK-NEXT:    or.b32 %r17, %r13, 4194304;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p2;
+; CHECK-NEXT:    prmt.b32 %r19, %r18, %r10, 0x7632U;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs4;
+; CHECK-NEXT:    shl.b32 %r21, %r20, 16;
+; CHECK-NEXT:    mov.b32 %f5, %r21;
+; CHECK-NEXT:    lg2.approx.f32 %f6, %f5;
+; CHECK-NEXT:    mov.b32 %r22, %f6;
+; CHECK-NEXT:    bfe.u32 %r23, %r22, 16, 1;
+; CHECK-NEXT:    add.s32 %r24, %r23, %r22;
+; CHECK-NEXT:    add.s32 %r25, %r24, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p3, %f6, %f6;
+; CHECK-NEXT:    or.b32 %r26, %r22, 4194304;
+; CHECK-NEXT:    selp.b32 %r27, %r26, %r25, %p3;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs3;
+; CHECK-NEXT:    shl.b32 %r29, %r28, 16;
+; CHECK-NEXT:    mov.b32 %f7, %r29;
+; CHECK-NEXT:    lg2.approx.f32 %f8, %f7;
+; CHECK-NEXT:    mov.b32 %r30, %f8;
+; CHECK-NEXT:    bfe.u32 %r31, %r30, 16, 1;
+; CHECK-NEXT:    add.s32 %r32, %r31, %r30;
+; CHECK-NEXT:    add.s32 %r33, %r32, 32767;
+; CHECK-NEXT:    setp.nan.f32 %p4, %f8, %f8;
+; CHECK-NEXT:    or.b32 %r34, %r30, 4194304;
+; CHECK-NEXT:    selp.b32 %r35, %r34, %r33, %p4;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r27, 0x7632U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r36, %r19};
+; CHECK-NEXT:    ret;
+entry:
+  %log2 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> %in)
+  ret <4 x bfloat> %log2
+}
+
+declare float @llvm.log2.f32(float %val)
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+declare half @llvm.log2.f16(half %val)
+
+declare <4 x half> @llvm.log2.v4f16(<4 x half> %val)
+
+declare bfloat @llvm.log2.bf16(bfloat %val)
+
+declare <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> %val)
+
+attributes #0 = {"denormal-fp-math"="preserve-sign"}



More information about the llvm-commits mailing list