[polly] r312239 - [PPCGCodeGen] Convert intrinsics to libdevice functions whenever possible.

Thu Aug 31 06:03:37 PDT 2017

Author: bollu
Date: Thu Aug 31 06:03:37 2017
New Revision: 312239

URL: http://llvm.org/viewvc/llvm-project?rev=312239&view=rev
Log:
[PPCGCodeGen] Convert intrinsics to libdevice functions whenever possible.

This is useful when we face certain intrinsics such as `llvm.exp.*`
which cannot be lowered by the NVPTX backend while other intrinsics can.

Otherwise, we would need to keep blacklists of intrinsics that cannot be
handled by the NVPTX backend. It is much simpler to try to promote
all intrinsics to libdevice versions.

This patch makes the handling of functions and intrinsics uniform: we always
try to use a libdevice version if one exists.

Differential Revision: https://reviews.llvm.org/D37056

Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
    polly/trunk/test/GPGPU/libdevice-functions-copied-into-kernel.ll

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=312239&r1=312238&r2=312239&view=diff
==============================================================================

--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Thu Aug 31 06:03:37 2017
@@ -1383,15 +1383,36 @@ isl_bool collectReferencesInGPUStmt(__is
 
 /// A list of functions that are available in NVIDIA's libdevice.
 const std::set<std::string> CUDALibDeviceFunctions = {
-    "exp",   "expf",     "expl",      "cos",       "cosf", "sqrt",
-    "sqrtf", "copysign", "copysignf", "copysignl", "log",  "logf"};
+    "exp",      "expf",      "expl",      "cos", "cosf", "sqrt", "sqrtf",
+    "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"};
+
+// A map from intrinsics to their corresponding libdevice functions.
+const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
+    {"llvm.exp.f64", "exp"},
+    {"llvm.exp.f32", "expf"},
+    {"llvm.powi.f64", "powi"},
+    {"llvm.powi.f32", "powif"}};
 
 /// Return the corresponding CUDA libdevice function name for @p F.
+/// Note that this function will try to convert intrinsics in the list
+/// IntrinsicToLibdeviceFunc into libdevice functions.
+/// This is because some intrinsics such as `exp`
+/// are not supported by the NVPTX backend.
+/// If this restriction of the backend is lifted, we should refactor our code
+/// so that we use intrinsics whenever possible.
 ///
 /// Return "" if we are not compiling for CUDA.
 std::string getCUDALibDeviceFuntion(Function *F) {
-  if (CUDALibDeviceFunctions.count(F->getName()))
-    return std::string("__nv_") + std::string(F->getName());
+  const std::string FnName = [&] {
+    auto It = IntrinsicToLibdeviceFunc.find(F->getName());
+    if (It != IntrinsicToLibdeviceFunc.end())
+      return It->second;
+
+    return std::string(F->getName());
+  }();
+
+  if (CUDALibDeviceFunctions.count(FnName))
+    return "__nv_" + FnName;
 
   return "";
 }
@@ -1409,7 +1430,7 @@ static bool isValidFunctionInKernel(llvm
 
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
-          Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
+          Name.startswith("llvm.copysign"));
 }
 
 /// Do not take `Function` as a subtree value.
@@ -2362,9 +2383,22 @@ bool GPUNodeBuilder::requiresCUDALibDevi
     if (!F.isDeclaration())
       continue;
 
-    std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
+    const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
     if (CUDALibDeviceFunc.length() != 0) {
-      F.setName(CUDALibDeviceFunc);
+      // We need to handle the case where a module looks like this:
+      // @expf(..)
+      // @llvm.exp.f32(..)
+      // Both of these functions would be renamed to `__nv_expf`.
+      //
+      // So, we must first check for the existence of the libdevice function.
+      // If this exists, we replace our current function with it.
+      //
+      // If it does not exist, we rename the current function to the
+      // libdevice function name.
+      if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
+        F.replaceAllUsesWith(Replacement);
+      else
+        F.setName(CUDALibDeviceFunc);
       RequiresLibDevice = true;
     }
   }

Modified: polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll?rev=312239&r1=312238&r2=312239&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll (original)
+++ polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll Thu Aug 31 06:03:37 2017
@@ -14,7 +14,7 @@
 ; KERNEL-IR:   %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR:   declare float @llvm.sqrt.f32(float)
 ; KERNEL-IR:   declare float @llvm.fabs.f32(float)
-; KERNEL-IR:   declare float @llvm.powi.f32(float, i32)
+
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -27,7 +27,6 @@
 ;       float tmp1 = sqrt(tmp1);
 ;       float tmp2 = fabs(tmp2);
 ;       float tmp3 = copysignf(tmp1, tmp2);
-;       float tmp4 = powi(tmp3, 2);
 ;       B[i] = tmp4;
 ;   }
 ; }
@@ -53,9 +52,8 @@ for.body:
   %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
   %fabs = tail call float @llvm.fabs.f32(float %sqrt);
   %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
-  %powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %powi, float* %B.arr.i, align 4
+  store float %copysign, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -73,7 +71,6 @@ for.end:
 declare float @llvm.sqrt.f32(float) #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.copysign.f32(float, float) #0
-declare float @llvm.powi.f32(float, i32) #0
 
 attributes #0 = { nounwind readnone }
 

Modified: polly/trunk/test/GPGPU/libdevice-functions-copied-into-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/libdevice-functions-copied-into-kernel.ll?rev=312239&r1=312238&r2=312239&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/libdevice-functions-copied-into-kernel.ll (original)
+++ polly/trunk/test/GPGPU/libdevice-functions-copied-into-kernel.ll Thu Aug 31 06:03:37 2017
@@ -22,6 +22,11 @@
 ; KERNEL-IR:   %p_cosf = tail call float @__nv_cosf(float %p_expf)
 ; KERNEL-IR:   %p_logf = tail call float @__nv_logf(float %p_cosf)
 
+; Powi and exp cannot be lowered directly. Rather, we expect them to be
+; lowered by libdevice.
+; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2)
+; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi)
+
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
 ; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
@@ -33,6 +38,8 @@
 ;       float expf  = expf(tmp1);
 ;       cosf = cosf(expf);
 ;       logf = logf(cosf);
+;       powi = powi(logf, 2);
+;       exp = exp(powi);
 ;       B[i] = logf;
 ;   }
 ; }
@@ -58,8 +65,10 @@ for.body:
   %expf = tail call float @expf(float %A.arr.i.val)
   %cosf = tail call float @cosf(float %expf)
   %logf = tail call float @logf(float %cosf)
+  %powi = tail call float @llvm.powi.f32(float %logf, i32 2)
+  %exp = tail call float @llvm.exp.f32(float %powi)
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %logf, float* %B.arr.i, align 4
+  store float %exp, float* %B.arr.i, align 4
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -77,6 +86,8 @@ for.end:
 declare float @expf(float) #0
 declare float @cosf(float) #0
 declare float @logf(float) #0
+declare float @llvm.powi.f32(float, i32) #0
+declare float @llvm.exp.f32(float) #0
 
 attributes #0 = { nounwind readnone }