[llvm] 28f3edd - AMDGPU: Add llvm.amdgcn.exp2 intrinsic

Thu Jun 15 04:00:17 PDT 2023

Author: Matt Arsenault
Date: 2023-06-15T07:00:07-04:00
New Revision: 28f3edd2be2a27b9dc7739d137147007c5fd9e41

URL: https://github.com/llvm/llvm-project/commit/28f3edd2be2a27b9dc7739d137147007c5fd9e41
DIFF: https://github.com/llvm/llvm-project/commit/28f3edd2be2a27b9dc7739d137147007c5fd9e41.diff

LOG: AMDGPU: Add llvm.amdgcn.exp2 intrinsic

Provide direct access to v_exp_f32 and v_exp_f16, so we can start
correctly lowering the generic exp intrinsics.

Unfortunately have to break from the usual naming convention of
matching the instruction name and stripping the v_ prefix. exp is
already taken by the export intrinsic. On the clang builtin side, we
have a choice of maintaining the convention to the instruction name,
or following the intrinsic name.

Added: 
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/test/CodeGenOpenCL/builtins-amdgcn.cl
    llvm/docs/AMDGPUUsage.rst
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/R600Instructions.td
    llvm/lib/Target/AMDGPU/VOP1Instructions.td

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0ccdc30ad386b..29aa9ca7552ed 100644

--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -101,6 +101,7 @@ BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_logf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_exp2f, "ff", "nc")
 BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
 BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 421c3d2e66ddb..4939392f5d148 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17173,6 +17173,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return EmitAMDGPUDispatchPtr(*this, E);
   case AMDGPU::BI__builtin_amdgcn_logf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
+  case AMDGPU::BI__builtin_amdgcn_exp2f:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 3900a8cacd930..f611bc6f16d15 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -179,6 +179,13 @@ void test_log_f32(global float* out, float a)
   *out = __builtin_amdgcn_logf(a);
 }
 
+// CHECK-LABEL: @test_exp2_f32
+// CHECK: call float @llvm.amdgcn.exp2.f32
+void test_exp2_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_exp2f(a);
+}
+
 // CHECK-LABEL: @test_log_clamp_f32
 // CHECK: call float @llvm.amdgcn.log.clamp.f32
 void test_log_clamp_f32(global float* out, float a)

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3acd83feacde8..a53c06f43868f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -955,6 +955,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   llvm.amdgcn.log                            Provides direct access to v_log_f32 and v_log_f16
                                              (on targets with half support). Peforms log2 function.
 
+  llvm.amdgcn.exp2                           Provides direct access to v_exp_f32 and v_exp_f16
+                                             (on targets with half support). Performs exp2 function.
+
   =========================================  ==========================================================
 
 .. TODO::

diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 4d078cf42f060..2b1edd5d6ba38 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -136,6 +136,10 @@ Changes to the AMDGPU Backend
 * Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to
   v_log_f32.
 
+* Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to
+  v_exp_f32.
+
+
 Changes to the ARM Backend
 --------------------------
 

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0c6646c01ae22..8c0f25b088787 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -308,6 +308,15 @@ def int_amdgcn_log : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
+// already). Performs exp2. f32 version does not handle
+// denormals. There is no reason to use this for f16 as it does
+// support denormals, and the generic exp2 intrinsic should be
+// preferred.
+def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
+  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2a0d6297f2c0c..a09604fa1872f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4688,6 +4688,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RCP_IFLAG)
   NODE_NAME_CASE(LOG)
+  NODE_NAME_CASE(EXP)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(FP_CLASS)

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index af5979ef8590a..7d3f23b28a50b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -448,6 +448,9 @@ enum NodeType : unsigned {
   // log2, no denormal handling for f32.
   LOG,
 
+  // exp2, no denormal handling for f32.
+  EXP,
+
   FMUL_LEGACY,
   RSQ_CLAMP,
   FP_CLASS,

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ece730ca83c36..c5a23d2eb5e98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -118,6 +118,9 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
 // v_log_f32, which is log2
 def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
 
+// v_exp_f32, which is exp2
+def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;
+
 // out = 1.0 / sqrt(a)
 def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
 
@@ -394,6 +397,11 @@ def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
 def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
                                               (flog2 node:$src)]>;
 
+def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+                                           (AMDGPUexp_impl node:$src),
+                                           (fexp2 node:$src)]>; // FIXME: Remove me
+def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+                                              (fexp2 node:$src)]>;
 
 def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_class node:$src0, node:$src1),

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d0425f84307c3..d14c020dfa881 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4208,6 +4208,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_cos:
     case Intrinsic::amdgcn_log_clamp:
     case Intrinsic::amdgcn_log:
+    case Intrinsic::amdgcn_exp2:
     case Intrinsic::amdgcn_rcp:
     case Intrinsic::amdgcn_rcp_legacy:
     case Intrinsic::amdgcn_sqrt:

diff  --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index d42f3c57d8537..f4dfbe8adc75d 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1090,7 +1090,7 @@ multiclass CUBE_Common <bits<11> inst> {
 } // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
 
 class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "EXP_IEEE", fexp2
+  inst, "EXP_IEEE", AMDGPUexp
 > {
   let Itinerary = TransALU;
 }

diff  --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2179dda4c6337..06ded85946f50 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -321,7 +321,7 @@ defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
 defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>;
 defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
 defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
 defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
@@ -488,7 +488,7 @@ defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
 defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
 defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
-defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
 defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
 defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 } // End TRANS = 1, SchedRW = [WriteTrans32]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
new file mode 100644
index 0000000000000..99a092e310abb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @v_exp2_f32(float %src)  {
+; GCN-LABEL: v_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %src)
+  ret float %exp2
+}
+
+define float @v_fabs_exp2_f32(float %src)  {
+; GCN-LABEL: v_fabs_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %fabs.src)
+  ret float %exp2
+}
+
+define float @v_fneg_fabs_exp2_f32(float %src)  {
+; GCN-LABEL: v_fneg_fabs_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %neg.fabs.src = fneg float %fabs.src
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %neg.fabs.src)
+  ret float %exp2
+}
+
+define half @v_exp2_f16(half %src)  {
+; GCN-LABEL: v_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %src)
+  ret half %exp2
+}
+
+define half @v_fabs_exp2_f16(half %src)  {
+; GCN-LABEL: v_fabs_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %fabs.src)
+  ret half %exp2
+}
+
+define half @v_fneg_fabs_exp2_f16(half %src)  {
+; GCN-LABEL: v_fneg_fabs_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %neg.fabs.src = fneg half %fabs.src
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %neg.fabs.src)
+  ret half %exp2
+}
+
+declare half @llvm.amdgcn.exp2.f16(half) #0
+declare float @llvm.amdgcn.exp2.f32(float) #0
+declare float @llvm.fabs.f32(float) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}