[Openmp-commits] [openmp] a013981 - [OpenMPOpt][FIX] Ensure to propagate information about parallel regions
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Fri Aug 25 10:47:08 PDT 2023
Author: Johannes Doerfert
Date: 2023-08-25T10:46:56-07:00
New Revision: a01398156a0ce4750d018033015e39c3eb159980
URL: https://github.com/llvm/llvm-project/commit/a01398156a0ce4750d018033015e39c3eb159980
DIFF: https://github.com/llvm/llvm-project/commit/a01398156a0ce4750d018033015e39c3eb159980.diff
LOG: [OpenMPOpt][FIX] Ensure to propagate information about parallel regions
Before, we checked the parallel region only once and ignored later
updates to the KernelInfo of the parallel region. Among other things,
this caused us to conclude that nested parallel sections were not
present even when they were.
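To make the failure mode concrete, here is a minimal standalone sketch
(plain C++, not LLVM code; all names are invented) of the pattern being
fixed: information derived from another abstract state must be re-queried
on every fixpoint update, not sampled once up front.

#include <iostream>

// Stand-in for the KernelInfo of a reached parallel region; its facts
// can still be refined while the fixpoint iteration runs.
struct RegionInfo {
  bool HasParallel = false;
};

// Stand-in for the kernel-level state that aggregates region facts.
struct KernelState {
  bool NestedParallelism = false;
};

int main() {
  RegionInfo Region;
  KernelState Buggy, Fixed;

  // Buggy pattern: sample the region info once, at initialization.
  Buggy.NestedParallelism = Region.HasParallel;

  for (int Iter = 0; Iter < 3; ++Iter) {
    if (Iter == 1)
      Region.HasParallel = true; // a later refinement the buggy variant misses
    // Fixed pattern: re-read the dependent state on every update.
    Fixed.NestedParallelism |= Region.HasParallel;
  }

  // Prints "buggy: 0 fixed: 1".
  std::cout << "buggy: " << Buggy.NestedParallelism
            << " fixed: " << Fixed.NestedParallelism << "\n";
  return 0;
}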
Added:
openmp/libomptarget/test/offloading/bug64959.c
Modified:
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 8c68470a184721..63493eb78c451a 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -670,7 +671,7 @@ struct KernelInfoState : AbstractState {
/// The parallel regions (identified by the outlined parallel functions) that
/// can be reached from the associated function.
- BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
+ BooleanStateWithPtrSetVector<CallBase, /* InsertInvalidates */ false>
ReachedKnownParallelRegions;
/// State to track what parallel region we might reach.
@@ -4455,11 +4456,15 @@ struct AAKernelInfoFunction : AAKernelInfo {
Value *ZeroArg =
Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
+ const unsigned int WrapperFunctionArgNo = 6;
+
// Now that we have most of the CFG skeleton it is time for the if-cascade
// that checks the function pointer we got from the runtime against the
// parallel regions we expect, if there are any.
for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
- auto *ParallelRegion = ReachedKnownParallelRegions[I];
+ auto *CB = ReachedKnownParallelRegions[I];
+ auto *ParallelRegion = dyn_cast<Function>(
+ CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
BasicBlock *PRExecuteBB = BasicBlock::Create(
Ctx, "worker_state_machine.parallel_region.execute", Kernel,
StateMachineEndParallelBB);
@@ -4822,8 +4827,6 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return;
}
- const unsigned int NonWrapperFunctionArgNo = 5;
- const unsigned int WrapperFunctionArgNo = 6;
RuntimeFunction RF = It->getSecond();
switch (RF) {
// All the functions we know are compatible with SPMD mode.
@@ -4902,28 +4905,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
case OMPRTL___kmpc_target_deinit:
KernelDeinitCB = &CB;
break;
- case OMPRTL___kmpc_parallel_51: {
- auto *ParallelRegionOp =
- CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts();
- if (isa<ConstantPointerNull>(ParallelRegionOp))
- ParallelRegionOp =
- CB.getArgOperand(NonWrapperFunctionArgNo)->stripPointerCasts();
- if (auto *ParallelRegion = dyn_cast<Function>(ParallelRegionOp)) {
- ReachedKnownParallelRegions.insert(ParallelRegion);
- /// Check nested parallelism
- auto *FnAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
- NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
- !FnAA->ReachedKnownParallelRegions.empty() ||
- !FnAA->ReachedUnknownParallelRegions.empty();
- break;
- }
- // The condition above should usually get the parallel region function
- // pointer and record it. In the off chance it doesn't we assume the
- // worst.
- ReachedUnknownParallelRegions.insert(&CB);
- break;
- }
+ case OMPRTL___kmpc_parallel_51:
+ if (!handleParallel51(A, CB))
+ indicatePessimisticFixpoint();
+ return;
case OMPRTL___kmpc_omp_task:
// We do not look into tasks right now, just give up.
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
@@ -4969,14 +4954,21 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return ChangeStatus::CHANGED;
}
+ KernelInfoState StateBefore = getState();
+ CallBase &CB = cast<CallBase>(getAssociatedValue());
+ if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
+ if (!handleParallel51(A, CB))
+ return indicatePessimisticFixpoint();
+ return StateBefore == getState() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
// F is a runtime function that allocates or frees memory, check
// AAHeapToStack and AAHeapToShared.
- KernelInfoState StateBefore = getState();
assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||
It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
- CallBase &CB = cast<CallBase>(getAssociatedValue());
auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
*this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
@@ -5008,6 +5000,32 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return StateBefore == getState() ? ChangeStatus::UNCHANGED
: ChangeStatus::CHANGED;
}
+
+  /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call
+  /// was handled; if a problem occurred, false is returned.
+ bool handleParallel51(Attributor &A, CallBase &CB) {
+ const unsigned int NonWrapperFunctionArgNo = 5;
+ const unsigned int WrapperFunctionArgNo = 6;
+ auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
+ ? NonWrapperFunctionArgNo
+ : WrapperFunctionArgNo;
+
+ auto *ParallelRegion = dyn_cast<Function>(
+ CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());
+ if (!ParallelRegion)
+ return false;
+
+ ReachedKnownParallelRegions.insert(&CB);
+ /// Check nested parallelism
+ auto *FnAA = A.getAAFor<AAKernelInfo>(
+ *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+ NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
+ !FnAA->ReachedKnownParallelRegions.empty() ||
+ !FnAA->ReachedKnownParallelRegions.isValidState() ||
+ !FnAA->ReachedUnknownParallelRegions.isValidState() ||
+ !FnAA->ReachedUnknownParallelRegions.empty();
+ return true;
+ }
};
struct AAFoldRuntimeCall
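As an aside on the ReachedKnownParallelRegions change from Function to
CallBase above: here is a hypothetical standalone model (plain C++; the
argument list is invented for illustration) of why recording the call
site, and re-resolving the region function from it, is the safer choice.
Which argument of __kmpc_parallel_51 names the parallel region (index 5
vs. 6 in handleParallel51) depends on the SPMD-compatibility assumption,
and that assumption can still flip while the fixpoint iteration runs.

#include <iostream>
#include <string>
#include <vector>

// Stand-in for a call site; Args models the call's argument operands.
struct Call {
  std::vector<std::string> Args;
};

// Mirrors the argument-slot choice in handleParallel51.
const unsigned NonWrapperFunctionArgNo = 5;
const unsigned WrapperFunctionArgNo = 6;

std::string parallelRegionOf(const Call &CB, bool SPMDAssumed) {
  unsigned ArgNo = SPMDAssumed ? NonWrapperFunctionArgNo : WrapperFunctionArgNo;
  return CB.Args.at(ArgNo);
}

int main() {
  // Invented argument names; only positions 5 and 6 matter here.
  Call CB{{"loc", "tid", "if_cond", "num_threads", "proc_bind",
           "outlined_fn", "outlined_wrapper"}};

  // Caching the resolved function would bake in the current assumption;
  // keeping the call site lets every iteration re-resolve it.
  std::cout << parallelRegionOf(CB, /*SPMDAssumed=*/true) << "\n";  // outlined_fn
  std::cout << parallelRegionOf(CB, /*SPMDAssumed=*/false) << "\n"; // outlined_wrapper
  return 0;
}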
diff --git a/openmp/libomptarget/test/offloading/bug64959.c b/openmp/libomptarget/test/offloading/bug64959.c
new file mode 100644
index 00000000000000..500911fa3f3163
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug64959.c
@@ -0,0 +1,57 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+// TODO: This requires malloc support for the thread states.
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
+#include <omp.h>
+#include <stdio.h>
+#define N 10
+
+int isCPU() { return 1; }
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+int isCPU() { return 0; }
+#pragma omp end declare variant
+
+int main(void) {
+ long int aa = 0;
+ int res = 0;
+
+ int ng = 12;
+ int cmom = 14;
+ int nxyz;
+
+#pragma omp target map(from : nxyz, ng, cmom)
+ {
+ nxyz = isCPU() ? 2 : 5000;
+ ng = isCPU() ? 2 : 12;
+ cmom = isCPU() ? 2 : 14;
+ }
+
+#pragma omp target teams distribute num_teams(nxyz) \
+ thread_limit(ng *(cmom - 1)) map(tofrom : aa)
+ for (int gid = 0; gid < nxyz; gid++) {
+#pragma omp parallel for collapse(2)
+ for (unsigned int g = 0; g < ng; g++) {
+ for (unsigned int l = 0; l < cmom - 1; l++) {
+ int a = 0;
+#pragma omp parallel for reduction(+ : a)
+ for (int i = 0; i < N; i++) {
+ a += i;
+ }
+#pragma omp atomic
+ aa += a;
+ }
+ }
+ }
+ long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2);
+ printf("The result is = %ld exp:%ld!\n", aa, exp);
+ if (aa != exp) {
+ printf("Failed %ld\n", aa);
+ return 1;
+ }
+ // CHECK: Success
+ printf("Success\n");
+ return 0;
+}
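For reference, the value the test checks: each innermost parallel region
reduces sum(0..N-1) = 10 * 9 / 2 = 45 into a, and there are
nxyz * ng * (cmom - 1) such regions. With the GPU values set by the first
target region (nxyz = 5000, ng = 12, cmom = 14) that is
5000 * 12 * 13 * 45 = 35,100,000; on the host fallback (all three set
to 2) it is 2 * 2 * 1 * 45 = 180.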