[llvm] AMDGPU: Perform libcall recognition to replace fast OpenCL pow (PR #182135)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 18 12:52:01 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
If a float-typed call site is marked with afn, replace the 4
flavors of pow with a faster variant.
This transforms pow, powr, pown, and rootn to __pow_fast,
__powr_fast, __pown_fast, and __rootn_fast if available. Also
attempts to handle all of the same basic folds on the new fast
variants that were already performed with the base forms. This
maintains optimizations with OpenCL when the device libs unsafe
math control library is deleted. This maintains the status quo
of how libcalls work, and only handles 4 new entry points. This
only helps with the elimination of the control library, and not
general libcall emission problems.
This makes no practical difference for HIP, which is the status
quo for libcall optimizations. AMDGPULibCalls recognizes the OpenCL
mangled names. e.g., OpenCL float "pow" is really _Z3powff but the
HIP-provided function "powf" is really named _ZL4powfff, and std::pow
with float is _ZL3powff. The pass still runs for HIP, so by accident
if you used the OpenCL mangled function names, this would trigger.
Since the functions cannot yet be relied on from the library,
introduce a temporary module flag check. I'm not planning on emitting
it anywhere and it's a poor substitute for versioning the target.
---
Patch is 905.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182135.diff
12 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+112-11)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp (+19)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibFunc.h (+4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+6-45)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-fast.ll (+658)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll (+4143-1728)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown-fast.ll (+566)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll (+171-125)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr-fast.ll (+487)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll (+1244-881)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-fast.ll (+452)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll (+633-442)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index d0c13d154995d..54aac24d95dbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -63,6 +63,19 @@ class AMDGPULibCalls {
// "FuncName" exists. It may create a new function prototype in pre-link mode.
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
+ /// Wrapper around getFunction which tries to use a faster variant if
+ /// available, and falls back to a less fast option.
+ ///
+ /// Return a replacement function for \p fInfo that has float-typed fast
+ /// variants. \p NewFunc is a base replacement function to use. \p
+ /// NewFuncFastVariant is a faster version to use if the calling context knows
+ /// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
+ /// should be EI_NONE.
+ FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
+ FuncInfo &newInfo,
+ AMDGPULibFunc::EFuncId NewFunc,
+ AMDGPULibFunc::EFuncId NewFuncFastVariant);
+
bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
@@ -410,6 +423,22 @@ FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
: AMDGPULibFunc::getFunction(M, fInfo);
}
+FunctionCallee AMDGPULibCalls::getFloatFastVariant(
+ Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
+ AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
+ assert(NewFunc != FastVariant);
+
+ if (FastVariant != AMDGPULibFunc::EI_NONE &&
+ getArgType(fInfo) == AMDGPULibFunc::F32) {
+ newInfo = AMDGPULibFunc(FastVariant, fInfo);
+ if (FunctionCallee NewCallee = getFunction(M, newInfo))
+ return NewCallee;
+ }
+
+ newInfo = AMDGPULibFunc(NewFunc, fInfo);
+ return getFunction(M, newInfo);
+}
+
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
FuncInfo &FInfo) {
return AMDGPULibFunc::parse(FMangledName, FInfo);
@@ -680,12 +709,21 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
{CI->getType(), CI->getArgOperand(1)->getType()}));
return true;
}
- case AMDGPULibFunc::EI_POW: {
+ case AMDGPULibFunc::EI_POW:
+ case AMDGPULibFunc::EI_POW_FAST: {
Module *M = Callee->getParent();
- AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
- FunctionCallee PowrFunc = getFunction(M, PowrInfo);
CallInst *Call = cast<CallInst>(FPOp);
+ FuncInfo PowrInfo;
+ AMDGPULibFunc::EFuncId FastPowrFuncId =
+ FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
+ ? AMDGPULibFunc::EI_POWR_FAST
+ : AMDGPULibFunc::EI_NONE;
+ FunctionCallee PowrFunc = getFloatFastVariant(
+ M, FInfo, PowrInfo, AMDGPULibFunc::EI_POWR, FastPowrFuncId);
+
+ // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.
+
// pow(x, y) -> powr(x, y) for x >= -0.0
// TODO: Account for flags on current call
if (PowrFunc && cannotBeOrderedLessThanZero(
@@ -698,8 +736,15 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(CI),
FPOp->getFastMathFlags())) {
FunctionType *PownType = getPownType(CI->getFunctionType());
- AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
- FunctionCallee PownFunc = getFunction(M, PownInfo);
+
+ FuncInfo PownInfo;
+ AMDGPULibFunc::EFuncId FastPownFuncId =
+ FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
+ ? AMDGPULibFunc::EI_POWN_FAST
+ : AMDGPULibFunc::EI_NONE;
+ FunctionCallee PownFunc = getFloatFastVariant(
+ M, FInfo, PownInfo, AMDGPULibFunc::EI_POWN, FastPownFuncId);
+
if (PownFunc) {
// TODO: If the incoming integral value is an sitofp/uitofp, it won't
// fold out without a known range. We can probably take the source
@@ -721,30 +766,78 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
if (!FMF.approxFunc())
return false;
+
+ if (FInfo.getId() == AMDGPULibFunc::EI_POW && FMF.approxFunc() &&
+ getArgType(FInfo) == AMDGPULibFunc::F32) {
+ AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
+ if (FunctionCallee PowFastFunc = getFunction(M, PowFastInfo)) {
+ Call->setCalledFunction(PowFastFunc);
+ return fold_pow(FPOp, B, PowFastInfo) || true;
+ }
+ }
+
return expandFastPow(FPOp, B, PowKind::Pow);
}
case AMDGPULibFunc::EI_POWR:
+ case AMDGPULibFunc::EI_POWR_FAST: {
if (fold_pow(FPOp, B, FInfo))
return true;
if (!FMF.approxFunc())
return false;
+
+ if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
+ getArgType(FInfo) == AMDGPULibFunc::F32) {
+ Module *M = Callee->getParent();
+ AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
+ if (FunctionCallee PowrFastFunc = getFunction(M, PowrFastInfo)) {
+ CI->setCalledFunction(PowrFastFunc);
+ return true;
+ }
+ }
+
if (!shouldReplaceLibcallWithIntrinsic(CI))
return false;
return expandFastPow(FPOp, B, PowKind::PowR);
+ }
case AMDGPULibFunc::EI_POWN:
+ case AMDGPULibFunc::EI_POWN_FAST: {
if (fold_pow(FPOp, B, FInfo))
return true;
if (!FMF.approxFunc())
return false;
+
+ if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
+ getArgType(FInfo) == AMDGPULibFunc::F32) {
+ Module *M = Callee->getParent();
+ AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
+ if (FunctionCallee PownFastFunc = getFunction(M, PownFastInfo)) {
+ CI->setCalledFunction(PownFastFunc);
+ return true;
+ }
+ }
+
if (!shouldReplaceLibcallWithIntrinsic(CI))
return false;
return expandFastPow(FPOp, B, PowKind::PowN);
+ }
case AMDGPULibFunc::EI_ROOTN:
+ case AMDGPULibFunc::EI_ROOTN_FAST: {
if (fold_rootn(FPOp, B, FInfo))
return true;
if (!FMF.approxFunc())
return false;
+
+ if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+ Module *M = Callee->getParent();
+ AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
+ if (FunctionCallee RootnFastFunc = getFunction(M, RootnFastInfo)) {
+ CI->setCalledFunction(RootnFastFunc);
+ return true;
+ }
+ }
+
return expandFastPow(FPOp, B, PowKind::RootN);
+ }
case AMDGPULibFunc::EI_SQRT:
// TODO: Allow with strictfp + constrained intrinsic
return tryReplaceLibcallWithSimpleIntrinsic(
@@ -846,8 +939,11 @@ static double log2(double V) {
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
const FuncInfo &FInfo) {
assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
+ FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
FInfo.getId() == AMDGPULibFunc::EI_POWR ||
- FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
+ FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
+ FInfo.getId() == AMDGPULibFunc::EI_POWN ||
+ FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
"fold_pow: encounter a wrong function call");
Module *M = B.GetInsertBlock()->getModule();
@@ -1000,18 +1096,21 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
V = log2(std::abs(V));
cnval = ConstantFP::get(eltType, V);
- needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
+ needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
+ FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
CF->isNegative();
} else {
needlog = true;
- needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
+ needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
+ FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
}
} else {
ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
if (!CDV) {
needlog = true;
- needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
+ needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
+ FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
} else {
assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
"Wrong vector size detected");
@@ -1036,7 +1135,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
}
}
- if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
+ if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
+ FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
// We cannot handle corner cases for a general pow() function, give up
// unless y is a constant integral value. Then proceed as if it were pown.
if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)),
@@ -1064,7 +1164,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
nval = CreateCallEx(B,LogExpr, nval, "__log2");
}
- if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
+ if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
+ FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
// convert int(32) to fp(f32 or f64)
opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 82233c0c891ad..68d617e343b99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -254,8 +254,11 @@ static constexpr ManglingRule manglingRules[] = {
{ "normalize" , {1}, {E_ANY}},
{ "popcount" , {1}, {E_ANY}},
{ "pow" , {1}, {E_ANY,E_COPY}},
+{ "__pow_fast" , {1}, {E_ANY,E_COPY}},
{ "pown" , {1}, {E_ANY,E_SETBASE_I32}},
+{ "__pown_fast" , {1}, {E_ANY,E_SETBASE_I32}},
{ "powr" , {1}, {E_ANY,E_COPY}},
+{ "__powr_fast" , {1}, {E_ANY,E_COPY}},
{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}},
{ "radians" , {1}, {E_ANY}},
{ "recip" , {1}, {E_ANY}},
@@ -266,6 +269,7 @@ static constexpr ManglingRule manglingRules[] = {
{ "rhadd" , {1}, {E_ANY,E_COPY}},
{ "rint" , {1}, {E_ANY}},
{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}},
+{ "__rootn_fast" , {1}, {E_ANY,E_SETBASE_I32}},
{ "rotate" , {1}, {E_ANY,E_COPY}},
{ "round" , {1}, {E_ANY}},
{ "rsqrt" , {1}, {E_ANY}},
@@ -1079,6 +1083,21 @@ Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
if (!fInfo.isCompatibleSignature(*M, F->getFunctionType()))
return nullptr;
+ switch (fInfo.getId()) {
+ case AMDGPULibFunc::EI_POW_FAST:
+ case AMDGPULibFunc::EI_POWR_FAST:
+ case AMDGPULibFunc::EI_POWN_FAST:
+ case AMDGPULibFunc::EI_ROOTN_FAST:
+ // TODO: Remove this. This is not a real module flag used anywhere. This is
+ // a bringup hack so this transform is testable prior to the library
+ // functions existing.
+ if (!M->getModuleFlag("amdgpu-libcall-have-fast-pow"))
+ return nullptr;
+ break;
+ default:
+ break;
+ }
+
return F;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index 580ef51b559d8..5a44cc4fc799e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -150,8 +150,11 @@ class AMDGPULibFuncBase {
EI_NORMALIZE,
EI_POPCOUNT,
EI_POW,
+ EI_POW_FAST,
EI_POWN,
+ EI_POWN_FAST,
EI_POWR,
+ EI_POWR_FAST,
EI_PREFETCH,
EI_RADIANS,
EI_RECIP,
@@ -162,6 +165,7 @@ class AMDGPULibFuncBase {
EI_RHADD,
EI_RINT,
EI_ROOTN,
+ EI_ROOTN_FAST,
EI_ROTATE,
EI_ROUND,
EI_RSQRT,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 5d08cef95810d..afe0971088bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -33,51 +33,12 @@ define float @test_pow_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_pow_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_lg_f32_e32 vcc, 1.0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CHECK-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; CHECK-NEXT: s_mov_b32 s4, 0x800000
-; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; CHECK-NEXT: v_ldexp_f32 v2, |v0|, v2
-; CHECK-NEXT: v_log_f32_e32 v2, v2
-; CHECK-NEXT: v_mov_b32_e32 v3, 0x42000000
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_sub_f32_e32 v2, v2, v3
-; CHECK-NEXT: v_mul_f32_e32 v3, v1, v2
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; CHECK-NEXT: v_fma_f32 v2, v1, v2, v3
-; CHECK-NEXT: v_exp_f32_e32 v2, v2
-; CHECK-NEXT: v_not_b32_e32 v3, 63
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NEXT: v_mul_f32_e32 v4, 0.5, v1
-; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
-; CHECK-NEXT: v_trunc_f32_e32 v3, v1
-; CHECK-NEXT: v_trunc_f32_e32 v5, v4
-; CHECK-NEXT: v_cmp_eq_f32_e32 vcc, v3, v1
-; CHECK-NEXT: v_cmp_lg_f32_e64 s[4:5], v5, v4
-; CHECK-NEXT: s_and_b64 vcc, vcc, s[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v4, 1.0, v0, vcc
-; CHECK-NEXT: s_brev_b32 s8, -2
-; CHECK-NEXT: v_cmp_lg_f32_e64 s[4:5], v3, v1
-; CHECK-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
-; CHECK-NEXT: v_bfi_b32 v2, s8, v2, v4
-; CHECK-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; CHECK-NEXT: v_cmp_eq_f32_e64 s[4:5], 0, v0
-; CHECK-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; CHECK-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; CHECK-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NEXT: v_bfi_b32 v0, s8, v1, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10__pow_fastff@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10__pow_fastff@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[16:17]
%pow = tail call fast float @_Z3powff(float %x, float %y)
ret float %pow
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-fast.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-fast.ll
new file mode 100644
index 0000000000000..961412ae45d2c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-fast.ll
@@ -0,0 +1,658 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink %s | FileCheck %s
+
+define float @test_pow_afn_f32(float %x, float %y) #0 {
+; CHECK-LABEL: define float @test_pow_afn_f32(
+; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn float @_Z10__pow_fastff(float [[X]], float [[Y]])
+; CHECK-NEXT: ret float [[CALL]]
+;
+entry:
+ %call = tail call afn float @_Z3powff(float %x, float %y)
+ ret float %call
+}
+
+declare float @_Z3powff(float, float) #1
+
+define <2 x float> @test_pow_afn_v2f32(<2 x float> %x, <2 x float> %y) #0 {
+; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32(
+; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn <2 x float> @_Z10__pow_fastDv2_fS_(<2 x float> [[X]], <2 x float> [[Y]])
+; CHECK-NEXT: ret <2 x float> [[CALL]]
+;
+entry:
+ %call = tail call afn <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y)
+ ret <2 x float> %call
+}
+
+declare <2 x float> @_Z3powDv2_fS_(<2 x float>, <2 x float>) #1
+
+define <3 x float> @test_pow_afn_v3f32(<3 x float> %x, <3 x float> %y) #0 {
+; CHECK-LABEL: define <3 x float> @test_pow_afn_v3f32(
+; CHECK-SAME: <3 x float> [[X:%.*]], <3 x float> [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn <3 x float> @_Z10__pow_fastDv3_fS_(<3 x float> [[X]], <3 x float> [[Y]])
+; CHECK-NEXT: ret <3 x float> [[CALL]]
+;
+entry:
+ %call = tail call afn <3 x float> @_Z3powDv3_fS_(<3 x float> %x, <3 x float> %y)
+ ret <3 x float> %call
+}
+
+declare <3 x float> @_Z3powDv3_fS_(<3 x float>, <3 x float>) #1
+
+define <4 x float> @test_pow_afn_v4f32(<4 x float> %x, <4 x float> %y) #0 {
+; CHECK-LABEL: define <4 x float> @test_pow_afn_v4f32(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn <4 x float> @_Z10__pow_fastDv4_fS_(<4 x float> [[X]], <4 x float> [[Y]])
+; CHECK-NEXT: ret <4 x float> [[CALL]]
+;
+entry:
+ %call = tail call afn <4 x float> @_Z3powDv4_fS_(<4 x float> %x, <4 x float> %y)
+ ret <4 x float> %call
+}
+
+declare <4 x float> @_Z3powDv4_fS_(<4 x float>, <4 x float>) #1
+
+define <8 x float> @test_pow_afn_v8f32(<8 x float> %x, <8 x float> %y) #0 {
+; CHECK-LABEL: define <8 x float> @test_pow_afn_v8f32(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn <8 x float> @_Z10__pow_fastDv8_fS_(<8 x float> [[X]], <8 x float> [[Y]])
+; CHECK-NEXT: ret <8 x float> [[CALL]]
+;
+entry:
+ %call = tail call afn <8 x float> @_Z3powDv8_fS_(<8 x float> %x, <8 x float> %y)
+ ret <8 x float> %call
+}
+
+declare <8 x float> @_Z3powDv8_fS_(<8 x float>, <8 x float>) #1
+
+define <16 x float> @test_pow_afn_v16f32(<16 x float> %x, <16 x float> %y) #0 {
+; CHECK-LABEL: define <16 x float> @test_pow_afn_v16f32(
+; CHECK-SAME: <16 x float> [[X:%.*]], <16 x float> [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn <16 x float> @_Z10__pow_fastDv16_fS_(<16 x float> [[X]], <16 x float> [[Y]])
+; CHECK-NEXT: ret <16 x float> [[CALL]]
+;
+entry:
+ %call = tail call afn <16 x float> @_Z3powDv16_fS_(<16 x float> %x, <16 x float> %y)
+ ret <16 x float> %call
+}
+
+declare <16 x float> @_Z3powDv16_fS_(<16 x float>, <16 x float>) #1
+
+
+define float @test_pow_afn_f32__known_positive_x(float nofpclass(ninf nnorm nsub nzero) %x, float %y) #0 {
+; CHECK-LABEL: define float @test_pow_afn_f32__known_positive_x(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[X:%.*]], float [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call afn float @_Z11__powr_fastff(float [[X]], float [[Y]])
+; CHECK-NEXT: ret float [[CALL]]
+;
+entry:
+ %call = tail call afn float @_Z3powff(float %x, float %y)
+ ret float %call
+}
+
+define float @test_pow_afn_f32__known_positive_x__known_integral_y(float nofpclass(ninf nnorm nsub nzero) %x, i32 %y.int) #0 {
+; CHECK-LABEL: define float @test_pow_afn_f32__known_positive_x__known_integral_y(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[X:%.*]], i32 [[Y_INT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[Y:%.*]] = sitof...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/182135
More information about the llvm-commits
mailing list