[llvm] e8039ad - [OpenMP] Identify GPU kernels (aka. OpenMP target regions)

Fri Jul 10 23:50:11 PDT 2020

Author: Johannes Doerfert
Date: 2020-07-11T01:44:00-05:00
New Revision: e8039ad4def0c4a2499cfbaba38bcc8ef48dee92

URL: https://github.com/llvm/llvm-project/commit/e8039ad4def0c4a2499cfbaba38bcc8ef48dee92
DIFF: https://github.com/llvm/llvm-project/commit/e8039ad4def0c4a2499cfbaba38bcc8ef48dee92.diff

LOG: [OpenMP] Identify GPU kernels (aka. OpenMP target regions)

We now identify GPU kernels, that is, entry points into the GPU code.
These kernels can (but need not) correspond to OpenMP target regions. With
this patch we identify them and, on request, print them via remarks.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D83269

Added: 
    llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
index 0bd81ea8f543..d96187b73f9b 100644

--- a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
+++ b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
@@ -17,6 +17,9 @@ namespace llvm {
 
 namespace omp {
 
+/// Summary of a kernel (=entry point for target offloading).
+using Kernel = Function *;
+
 /// Helper to remember if the module contains OpenMP (runtime calls), to be used
 /// foremost with containsOpenMP.
 struct OpenMPInModule {
@@ -30,8 +33,17 @@ struct OpenMPInModule {
   bool isKnown() { return Value != OpenMP::UNKNOWN; }
   operator bool() { return Value != OpenMP::NOT_FOUND; }
 
+  /// Return the known kernels (=GPU entry points) in the module.
+  SmallPtrSetImpl<Kernel> &getKernels() { return Kernels; }
+
+  /// Identify kernels in the module and populate the Kernels set.
+  void identifyKernels(Module &M);
+
 private:
   enum class OpenMP { FOUND, NOT_FOUND, UNKNOWN } Value = OpenMP::UNKNOWN;
+
+  /// Collection of known kernels (=GPU entry points) in the module.
+  SmallPtrSet<Kernel, 8> Kernels;
 };
 
 /// Helper to determine if \p M contains OpenMP (runtime calls).

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b2e30a4d2b79..f0fc8a6c8c4a 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -39,6 +39,8 @@ static cl::opt<bool> DisableOpenMPOptimizations(
 
 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                     cl::Hidden);
+static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
+                                        cl::init(false), cl::Hidden);
 
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
@@ -48,6 +50,8 @@ STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
           "Number of OpenMP runtime functions identified");
 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
           "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPTargetRegionKernels,
+          "Number of OpenMP target region entry points (=kernels) identified");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -99,9 +103,10 @@ struct AAICVTracker;
 struct OMPInformationCache : public InformationCache {
   OMPInformationCache(Module &M, AnalysisGetter &AG,
                       BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
-                      SmallPtrSetImpl<Function *> &ModuleSlice)
+                      SmallPtrSetImpl<Function *> &ModuleSlice,
+                      SmallPtrSetImpl<Kernel> &Kernels)
       : InformationCache(M, AG, Allocator, CGSCC), ModuleSlice(ModuleSlice),
-        OMPBuilder(M) {
+        OMPBuilder(M), Kernels(Kernels) {
     OMPBuilder.initialize();
     initializeRuntimeFunctions();
     initializeInternalControlVars();
@@ -399,6 +404,9 @@ struct OMPInformationCache : public InformationCache {
 
     // TODO: We should attach the attributes defined in OMPKinds.def.
   }
+
+  /// Collection of known kernels (\see Kernel) in the module.
+  SmallPtrSetImpl<Kernel> &Kernels;
 };
 
 struct OpenMPOpt {
@@ -423,26 +431,10 @@ struct OpenMPOpt {
                       << " functions in a slice with "
                       << OMPInfoCache.ModuleSlice.size() << " functions\n");
 
-    /// Print initial ICV values for testing.
-    /// FIXME: This should be done from the Attributor once it is added.
-    if (PrintICVValues) {
-      InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
-
-      for (Function *F : OMPInfoCache.ModuleSlice) {
-        for (auto ICV : ICVs) {
-          auto ICVInfo = OMPInfoCache.ICVs[ICV];
-          auto Remark = [&](OptimizationRemark OR) {
-            return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
-                      << " Value: "
-                      << (ICVInfo.InitValue
-                              ? ICVInfo.InitValue->getValue().toString(10, true)
-                              : "IMPLEMENTATION_DEFINED");
-          };
-
-          emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
-        }
-      }
-    }
+    if (PrintICVValues)
+      printICVs();
+    if (PrintOpenMPKernels)
+      printKernels();
 
     Changed |= runAttributor();
 
@@ -455,6 +447,42 @@ struct OpenMPOpt {
     return Changed;
   }
 
+  /// Print initial ICV values for testing: one remark per function in the
+  /// module slice and ICV. FIXME: Do this from the Attributor once it is added.
+  void printICVs() const {
+    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
+
+    for (Function *F : OMPInfoCache.ModuleSlice) {
+      for (auto ICV : ICVs) {
+        auto ICVInfo = OMPInfoCache.ICVs[ICV];
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
+                    << " Value: "
+                    << (ICVInfo.InitValue
+                            ? ICVInfo.InitValue->getValue().toString(10, true)
+                            : "IMPLEMENTATION_DEFINED");
+        };
+
+        emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
+      }
+    }
+  }
+
+  /// Print OpenMP GPU kernels for testing: one remark per kernel found in SCC.
+  void printKernels() const {
+    for (Function *F : SCC) {
+      if (!OMPInfoCache.Kernels.count(F))
+        continue;
+
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "OpenMP GPU kernel "
+                  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
+      };
+
+      emitRemarkOnFunction(F, "OpenMPGPU", Remark);
+    }
+  }
+
   /// Return the call if \p U is a callee use in a regular call. If \p RFI is
   /// given it has to be the callee or a nullptr is returned.
   static CallInst *getCallIfRegularCall(
@@ -775,7 +803,7 @@ struct OpenMPOpt {
   template <typename RemarkKind,
             typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
   void emitRemark(Instruction *Inst, StringRef RemarkName,
-                  RemarkCallBack &&RemarkCB) {
+                  RemarkCallBack &&RemarkCB) const {
     Function *F = Inst->getParent()->getParent();
     auto &ORE = OREGetter(F);
 
@@ -785,9 +813,10 @@ struct OpenMPOpt {
 
   /// Emit a remark on a function. Since only OptimizationRemark is supporting
   /// this, it can't be made generic.
-  void emitRemarkOnFunction(
-      Function *F, StringRef RemarkName,
-      function_ref<OptimizationRemark(OptimizationRemark &&)> &&RemarkCB) {
+  void
+  emitRemarkOnFunction(Function *F, StringRef RemarkName,
+                       function_ref<OptimizationRemark(OptimizationRemark &&)>
+                           &&RemarkCB) const {
     auto &ORE = OREGetter(F);
 
     ORE.emit([&]() {
@@ -1044,7 +1073,8 @@ PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
   SetVector<Function *> Functions(SCC.begin(), SCC.end());
   BumpPtrAllocator Allocator;
   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
-                                /*CGSCC*/ &Functions, ModuleSlice);
+                                /*CGSCC*/ &Functions, ModuleSlice,
+                                OMPInModule.getKernels());
 
   Attributor A(Functions, InfoCache, CGUpdater);
 
@@ -1109,9 +1139,9 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
     AnalysisGetter AG;
     SetVector<Function *> Functions(SCC.begin(), SCC.end());
     BumpPtrAllocator Allocator;
-    OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
-                                  Allocator,
-                                  /*CGSCC*/ &Functions, ModuleSlice);
+    OMPInformationCache InfoCache(
+        *(Functions.back()->getParent()), AG, Allocator,
+        /*CGSCC*/ &Functions, ModuleSlice, OMPInModule.getKernels());
 
     Attributor A(Functions, InfoCache, CGUpdater);
 
@@ -1125,14 +1155,45 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
 
 } // end anonymous namespace
 
+/// Collect the functions tagged "kernel" in !nvvm.annotations into Kernels.
+void OpenMPInModule::identifyKernels(Module &M) {
+  // Only look the node up; a query must not add named metadata to the module.
+  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
+  if (!MD)
+    return;
+
+  for (auto *Op : MD->operands()) {
+    if (Op->getNumOperands() < 2)
+      continue;
+    MDString *KindID = dyn_cast_or_null<MDString>(Op->getOperand(1));
+    if (!KindID || KindID->getString() != "kernel")
+      continue;
+    Function *KernelFn =
+        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
+    if (!KernelFn)
+      continue;
+
+    // Count a kernel only once, even if it is annotated multiple times.
+    if (Kernels.insert(KernelFn).second)
+      ++NumOpenMPTargetRegionKernels;
+  }
+}
+
 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
   if (OMPInModule.isKnown())
     return OMPInModule;
-
 #define OMP_RTL(_Enum, _Name, ...)                                             \
-  if (M.getFunction(_Name))                                                    \
-    return OMPInModule = true;
+  else if (M.getFunction(_Name)) OMPInModule = true;
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
+  // The OMP_RTL expansions above extend the `if` as a chain of `else if` arms.
+  // Identify kernels once. TODO: We should split the OMPInformationCache into a
+  // module and an SCC part. The kernel information, among other things, could
+  // go into the module part.
+  if (OMPInModule.isKnown() && OMPInModule) {
+    OMPInModule.identifyKernels(M);
+    return true;
+  }
+
   return OMPInModule = false;
 }
 

diff  --git a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll
new file mode 100644
index 000000000000..ccdf0b981dc2
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll
@@ -0,0 +1,27 @@
+; RUN: opt -passes=openmpopt -pass-remarks=openmp-opt -openmp-print-gpu-kernels -disable-output < %s 2>&1 | FileCheck %s --implicit-check-not=non_kernel
+; RUN: opt        -openmpopt -pass-remarks=openmp-opt -openmp-print-gpu-kernels -disable-output < %s 2>&1 | FileCheck %s --implicit-check-not=non_kernel
+
+; CHECK-DAG: remark: <unknown>:0:0: OpenMP GPU kernel kernel1
+; CHECK-DAG: remark: <unknown>:0:0: OpenMP GPU kernel kernel2
+
+define void @kernel1() {
+  ret void
+}
+
+define void @kernel2() {
+  ret void
+}
+
+define void @non_kernel() {
+  ret void
+}
+
+; Needed to trigger the openmp-opt pass
+declare dso_local void @__kmpc_kernel_prepare_parallel(i8*)
+
+!nvvm.annotations = !{!2, !0, !1, !3, !1, !2}
+
+!0 = !{void ()* @kernel1, !"kernel", i32 1}
+!1 = !{void ()* @non_kernel, !"non_kernel", i32 1}
+!2 = !{null, !"align", i32 1}
+!3 = !{void ()* @kernel2, !"kernel", i32 1}