[llvm] fec1f21 - [OpenMP] Emit remarks during GPU state machine optimization

Tue Jul 14 20:36:21 PDT 2020

Author: Johannes Doerfert
Date: 2020-07-14T22:33:57-05:00
New Revision: fec1f2109f33c9a1a7650272b3bfb8f0f81f6a2b

URL: https://github.com/llvm/llvm-project/commit/fec1f2109f33c9a1a7650272b3bfb8f0f81f6a2b
DIFF: https://github.com/llvm/llvm-project/commit/fec1f2109f33c9a1a7650272b3bfb8f0f81f6a2b.diff

LOG: [OpenMP] Emit remarks during GPU state machine optimization

Since D83271 we can optimize the GPU state machine to avoid spurious
call edges that increase the register usage of kernels. With this patch
we inform the user why and if this optimization is happening and when it
is not.

Reviewed By: ye-luo

Differential Revision: https://reviews.llvm.org/D83707

Added: 
    clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
    clang/test/OpenMP/remarks_parallel_in_target_state_machine.c

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
new file mode 100644
index 000000000000..c5152d401c8b

--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1                                 -verify=host                                                              -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1                                 -verify=all,safe                                                          -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe                                                          -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar1(void) {
+#pragma omp parallel // #0
+                     // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // safe-remark@#0 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+                     // force-remark@#0 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+                     // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
+  {
+  }
+}
+void bar2(void) {
+#pragma omp parallel // #1
+                     // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // safe-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+                     // force-remark@#1 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+                     // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
+  {
+  }
+}
+
+void foo1(void) {
+#pragma omp target teams // #2
+                         // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+                         // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3
+                     // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+#pragma omp parallel // #4
+                     // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void foo2(void) {
+#pragma omp target teams // #5
+                         // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+                         // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #6
+                     // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #7
+                     // all-remark@#7 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#7 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void foo3(void) {
+#pragma omp target teams // #8
+                         // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+                         // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #9
+                     // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #10
+                     // all-remark@#10 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+                     // all-remark@#10 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}

diff  --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
new file mode 100644
index 000000000000..117ef6d46d49
--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1                                 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1                                 -verify      -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify      -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar(void) {
+#pragma omp parallel // #1                                                                                                                                                                                                                                                                                                                                           \
+                     // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+                     // expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  {
+  }
+}
+
+void foo(void) {
+#pragma omp target teams // #2                                                                                                                                                                      \
+                         // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}} \
+                         // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3                                                                                                                                                                                                                                                                                                                                           \
+                     // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+                     // expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar();
+#pragma omp parallel // #4                                                                                                                                                                                                                                                                                                                                           \
+                     // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+                     // expected-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 1da47e97e3bd..bc7e1dc6236e 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1033,6 +1033,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
     // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at
     // all.
     bool UnknownUse = false;
+    bool KernelPrepareUse = false;
     unsigned NumDirectCalls = 0;
 
     SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
@@ -1047,33 +1048,92 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
-      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),
-                                          &KernelPrepareParallelRFI)) {
+      if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+                                   *U.getUser(), &KernelPrepareParallelRFI)) {
+        KernelPrepareUse = true;
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
       UnknownUse = true;
     });
 
-    // If this ever hits, we should investigate.
-    if (UnknownUse || NumDirectCalls != 1)
+    // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+    // use.
+    if (!KernelPrepareUse)
       continue;
 
-    // TODO: This is not a necessary restriction and should be lifted.
-    if (ToBeReplacedStateMachineUses.size() != 2)
+    {
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Found a parallel region that is called in a target "
+                     "region but not part of a combined target construct nor "
+                     "nesed inside a target construct without intermediate "
+                     "code. This can lead to excessive register usage for "
+                     "unrelated target regions in the same translation unit "
+                     "due to spurious call edges assumed by ptxas.";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+    }
+
+    // If this ever hits, we should investigate.
+    // TODO: Checking the number of uses is not a necessary restriction and
+    // should be lifted.
+    if (UnknownUse || NumDirectCalls != 1 ||
+        ToBeReplacedStateMachineUses.size() != 2) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is used in "
+                    << (UnknownUse ? "unknown" : "unexpected")
+                    << " ways; will not attempt to rewrite the state machine.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+      }
       continue;
+    }
 
     // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
     // up if the function is not called from a unique kernel.
     Kernel K = getUniqueKernelFor(*F);
-    if (!K)
+    if (!K) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is not known to be called from a "
+                       "unique single target region, maybe the surrounding "
+                       "function has external linkage?; will not attempt to "
+                       "rewrite the state machine use.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
+                             Remark);
+      }
       continue;
+    }
 
     // We now know F is a parallel body function called only from the kernel K.
     // We also identified the state machine uses in which we replace the
     // function pointer by a new global symbol for identification purposes. This
     // ensures only direct calls to the function are left.
 
+    {
+      auto RemarkParalleRegion = [&](OptimizationRemark OR) {
+        return OR << "Specialize parallel region that is only reached from a "
+                     "single target region to avoid spurious call edges and "
+                     "excessive register usage in other target regions. "
+                     "(parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+                           RemarkParalleRegion);
+      auto RemarkKernel = [&](OptimizationRemark OR) {
+        return OR << "Target region containing the parallel region that is "
+                     "specialized. (parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+    }
+
     Module &M = *F->getParent();
     Type *Int8Ty = Type::getInt8Ty(M.getContext());