[llvm] 6448d5b - AMDGPU: Remove pointless libcall recognition of native_{divide|recip}

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 9 15:48:51 PDT 2023


Author: Matt Arsenault
Date: 2023-08-09T18:48:46-04:00
New Revision: 6448d5ba581a275ddaf9504368690abcf1aec244

URL: https://github.com/llvm/llvm-project/commit/6448d5ba581a275ddaf9504368690abcf1aec244
DIFF: https://github.com/llvm/llvm-project/commit/6448d5ba581a275ddaf9504368690abcf1aec244.diff

LOG: AMDGPU: Remove pointless libcall recognition of native_{divide|recip}

This was trying to constant fold these calls, and also to turn some of
them into a regular fmul/fdiv. There is no point in doing that; the
underlying library implementation should be using those in the first
place. Even when the library does use the rcp intrinsics, the backend
handles constant folding of those. These folds were also only performed
under overly strict fast-everything-is-required conditions.
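
For reference, the removed folds rewrote the scalar f32 variants
roughly as follows (an illustrative LLVM IR sketch, not part of the
patch; the constant operands are made up):

    ; fold_recip: [native_|half_]recip(c) ==> 1.0 / c, for constant c
    %r = call fast float @_Z10half_recipf(float 4.000000e+00)
    ; became
    %r = fdiv fast float 1.000000e+00, 4.000000e+00

    ; fold_divide: [native_|half_]divide(x, c) ==> x * (1.0 / c)
    %d = call fast float @_Z13native_divideff(float %x, float 3.000000e+00)
    ; became
    %rcp = fdiv fast float 1.000000e+00, 3.000000e+00
    %d = fmul fast float %x, %rcp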

The one possible advantage this had over linking in the library is
that, if all fast math flags were set on the call, they would be
propagated to the new instructions. We could address this in the
library by adding more fast math flags to the native implementations.
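
A minimal sketch of what that could look like, assuming a library body
written directly against the rcp intrinsic (hypothetical; the actual
device library implementation may differ):

    declare float @llvm.amdgcn.rcp.f32(float)

    ; Hypothetical definition carrying fast-math flags so they survive
    ; inlining into fast-math callers.
    define float @_Z13native_divideff(float %x, float %y) {
      %rcp = call fast float @llvm.amdgcn.rcp.f32(float %y)
      %ret = fmul fast float %x, %rcp
      ret float %ret
    }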

The constant fold case also had no test coverage.

https://reviews.llvm.org/D156676

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index b9b6f7d6d55f29..6beab66635a703 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -65,12 +65,6 @@ class AMDGPULibCalls {
 
   /* Specialized optimizations */
 
-  // recip (half or native)
-  bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
-  // divide (half or native)
-  bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
   // pow/powr/pown
   bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
@@ -587,19 +581,6 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
     case AMDGPULibFunc::EI_COS:
     case AMDGPULibFunc::EI_SIN:
       return fold_sincos(FPOp, B, FInfo);
-    case AMDGPULibFunc::EI_RECIP:
-      // skip vector function
-      assert((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
-              FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
-             "recip must be an either native or half function");
-      return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);
-
-    case AMDGPULibFunc::EI_DIVIDE:
-      // skip vector function
-      assert((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
-              FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
-             "divide must be an either native or half function");
-      return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
     case AMDGPULibFunc::EI_FMA:
     case AMDGPULibFunc::EI_MAD:
     case AMDGPULibFunc::EI_NFMA:
@@ -687,45 +668,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   return false;
 }
 
-//  [native_]half_recip(c) ==> 1.0/c
-bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
-                                const FuncInfo &FInfo) {
-  Value *opr0 = CI->getArgOperand(0);
-  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
-    // Just create a normal div. Later, InstCombine will be able
-    // to compute the divide into a constant (avoid check float infinity
-    // or subnormal at this point).
-    Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
-                               opr0,
-                               "recip2div");
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
-    replaceCall(CI, nval);
-    return true;
-  }
-  return false;
-}
-
-//  [native_]half_divide(x, c) ==> x/c
-bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
-                                 const FuncInfo &FInfo) {
-  Value *opr0 = CI->getArgOperand(0);
-  Value *opr1 = CI->getArgOperand(1);
-  ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
-  ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
-
-  if ((CF0 && CF1) ||  // both are constants
-      (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
-      // CF1 is constant && f32 divide
-  {
-    Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
-                                opr1, "__div2recip");
-    Value *nval  = B.CreateFMul(opr0, nval1, "__div2mul");
-    replaceCall(CI, nval);
-    return true;
-  }
-  return false;
-}
-
 namespace llvm {
 static double log2(double V) {
 #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L

diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index e6c4447ad0067e..2d3e7c336c1e9a 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -156,8 +156,10 @@ entry:
 
 declare float @_Z10half_recipf(float)
 
+; Do nothing, the underlying implementation will optimize correctly
+; after inlining.
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
-; GCN: fmul fast float %tmp, 0x3FD5555560000000
+; GCN: %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
 define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -168,8 +170,10 @@ entry:
 
 declare float @_Z13native_divideff(float, float)
 
+; Do nothing, the optimization will naturally happen after inlining.
+
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
-; GCN: fmul fast float %tmp, 0x3FD5555560000000
+; GCN: %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
 define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4


        

