[llvm] d61aac7 - [OpenMP][FIX] Do not signal SPMD-mode but then keep generic-mode

Tue Nov 2 21:23:05 PDT 2021

Author: Johannes Doerfert
Date: 2021-11-02T23:22:04-05:00
New Revision: d61aac76bf9009317ef15b1fc38a98a808ce9b19

URL: https://github.com/llvm/llvm-project/commit/d61aac76bf9009317ef15b1fc38a98a808ce9b19
DIFF: https://github.com/llvm/llvm-project/commit/d61aac76bf9009317ef15b1fc38a98a808ce9b19.diff

LOG: [OpenMP][FIX] Do not signal SPMD-mode but then keep generic-mode

If we assume SPMD-mode during the fixpoint iteration we have to execute
the kernel in SPMD-mode. If we change our mind during manifest there is
the chance of a mismatch between the simplification, e.g., of
`__kmpc_is_spmd_exec_mode` calls, and the execution mode. This problem
was introduced in D109438.

This patch is compromise to resolve the problem purely in OpenMP-opt
while trying to keep the benefits of D109438 around. This might not
always work, see `get_hardware_num_threads_in_block_fold` but it often
does. At the same time we do keep value specialization and execution
mode in sync.

Proper solutions to this problem should be considered. I believe a new
execution mode is the easiest way forward (Singleton-SPMD).
Alternatively, SPMD-mode execution can be used with a way to provide a
new thread_limit (here 1) to the runtime. This is more general and could
be useful if we see `num_threads` clauses or workshared loops with small
trip counts in the kernel. In either proposal we need to disable the
guarding for the kernel (which was the motivation for D109438).

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D112894

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/always_inline_device.ll
    llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 9aa8c9f832018..33ca121c71fef 100644

--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -597,6 +597,10 @@ struct KernelInfoState : AbstractState {
   /// See AbstractState::indicateOptimisticFixpoint(...)
   ChangeStatus indicateOptimisticFixpoint() override {
     IsAtFixpoint = true;
+    ReachingKernelEntries.indicateOptimisticFixpoint();
+    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
     return ChangeStatus::UNCHANGED;
   }
 
@@ -3058,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo {
     if (!KernelInitCB || !KernelDeinitCB)
       return ChangeStatus::UNCHANGED;
 
-    // Known SPMD-mode kernels need no manifest changes.
-    if (SPMDCompatibilityTracker.isKnown())
-      return ChangeStatus::UNCHANGED;
-
     // If we can we change the execution mode to SPMD-mode otherwise we build a
     // custom state machine.
-    if (!mayContainParallelRegion() || !changeToSPMDMode(A))
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    if (!changeToSPMDMode(A, Changed))
       return buildCustomStateMachine(A);
 
-    return ChangeStatus::CHANGED;
+    return Changed;
   }
 
-  bool changeToSPMDMode(Attributor &A) {
+  bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 
     if (!SPMDCompatibilityTracker.isAssumed()) {
@@ -3102,6 +3103,24 @@ struct AAKernelInfoFunction : AAKernelInfo {
       return false;
     }
 
+    // Check if the kernel is already in SPMD mode, if so, return success.
+    Function *Kernel = getAnchorScope();
+    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
+        (Kernel->getName() + "_exec_mode").str());
+    assert(ExecMode && "Kernel without exec mode?");
+    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
+
+    // Set the global exec mode flag to indicate SPMD-Generic mode.
+    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+           "ExecMode is not an integer!");
+    const int8_t ExecModeVal =
+        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+    if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
+      return true;
+
+    // We will now unconditionally modify the IR, indicate a change.
+    Changed = ChangeStatus::CHANGED;
+
     auto CreateGuardedRegion = [&](Instruction *RegionStartI,
                                    Instruction *RegionEndI) {
       LoopInfo *LI = nullptr;
@@ -3312,17 +3331,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
 
     // Adjust the global exec mode flag that tells the runtime what mode this
     // kernel is executed in.
-    Function *Kernel = getAnchorScope();
-    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
-        (Kernel->getName() + "_exec_mode").str());
-    assert(ExecMode && "Kernel without exec mode?");
-    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
-
-    // Set the global exec mode flag to indicate SPMD-Generic mode.
-    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
-           "ExecMode is not an integer!");
-    const int8_t ExecModeVal =
-        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
     assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
            "Initially non-SPMD kernel has SPMD exec mode!");
     ExecMode->setInitializer(
@@ -3699,6 +3707,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
     }
 
     // Callback to check a call instruction.
+    bool AllParallelRegionStatesWereFixed = true;
     bool AllSPMDStatesWereFixed = true;
     auto CheckCallInst = [&](Instruction &I) {
       auto &CB = cast<CallBase>(I);
@@ -3706,6 +3715,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
           *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
       getState() ^= CBAA.getState();
       AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
+      AllParallelRegionStatesWereFixed &=
+          CBAA.ReachedKnownParallelRegions.isAtFixpoint();
+      AllParallelRegionStatesWereFixed &=
+          CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
       return true;
     };
 
@@ -3717,6 +3730,23 @@ struct AAKernelInfoFunction : AAKernelInfo {
       return indicatePessimisticFixpoint();
     }
 
+    // If we haven't used any assumed information for the reached parallel
+    // region states we can fix it.
+    if (!UsedAssumedInformationInCheckCallInst &&
+        AllParallelRegionStatesWereFixed) {
+      ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+      ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
+    }
+
+    // If we are sure there are no parallel regions in the kernel we do not
+    // want SPMD mode.
+    if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
+        ReachedKnownParallelRegions.isAtFixpoint() &&
+        ReachedUnknownParallelRegions.isValidState() &&
+        ReachedKnownParallelRegions.isValidState() &&
+        !mayContainParallelRegion())
+      SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+
     // If we haven't used any assumed information for the SPMD state we can fix
     // it.
     if (!UsedAssumedInformationInCheckRWInst &&

diff  --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 7ff4541a740c3..a1f7a7ab694ba 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -6,6 +6,7 @@
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
 @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
+ at G = external global i8
 
 ; Function Attrs: convergent norecurse nounwind
 define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
@@ -16,6 +17,7 @@ define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
+; CHECK-NEXT:    store i8 0, i8* @G, align 1
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
@@ -27,6 +29,12 @@ entry:
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
+  ; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
+  ; generic execution.
+  ; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
+  ;       to only spawn a single thread. Further, we then should not guard any code.
+  %isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
+  store i8 %isSPMD, i8* @G
   call void @bar() #2
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
   ret void
@@ -35,6 +43,8 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
+declare i8 @__kmpc_is_spmd_exec_mode()
+
 declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
 
 declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)

diff  --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
index a817c384719a6..602d4f0be5779 100644
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -8,9 +8,9 @@ target triple = "nvptx64"
 
 @G = external global i32
 ;.
-; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
-; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
@@ -25,11 +25,11 @@ define weak void @kernel0() #0 {
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
   call void @helper0()
   call void @helper1()
   call void @helper2()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
   ret void
 }
 
@@ -43,9 +43,9 @@ define weak void @kernel1() #0 {
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
   call void @helper1()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
   ret void
 }