[Mlir-commits] [mlir] 0637778 - [mlir][LLVMIR][OpenMP] fix dominance for reduction init block (#96052)
llvmlistbot at llvm.org
Fri Jun 21 03:10:16 PDT 2024
Author: Tom Eccles
Date: 2024-06-21T11:10:12+01:00
New Revision: 0637778af4174daf9cc1c315049a79db27f36e72
URL: https://github.com/llvm/llvm-project/commit/0637778af4174daf9cc1c315049a79db27f36e72
DIFF: https://github.com/llvm/llvm-project/commit/0637778af4174daf9cc1c315049a79db27f36e72.diff
LOG: [mlir][LLVMIR][OpenMP] fix dominance for reduction init block (#96052)
It was incorrect to set the insertion point to the init block after
inlining the initialization region, because the code generated in the
init block depends upon the value yielded from the init region. When
there were multiple reduction initialization regions, each with multiple
blocks, this could lead to an initialization region being inlined after
the init block that depends upon it.
Moving the insertion point to before the initialization region is
inlined exposed further issues in the handling of the initialization
block's terminator, which are also fixed here.
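To illustrate the empty-init-block case that the terminator handling now
covers (a minimal sketch, not taken from this patch; the @add_i32 name and
the combiner body are made up for illustration): when the init region only
materializes an MLIR constant and yields it, no LLVM instruction is emitted
into the translated init block, so the terminator must be inserted into an
otherwise empty block.

  omp.declare_reduction @add_i32 : i32 init {
  ^bb0(%arg0: i32):
    // The llvm.mlir.constant is not converted into any LLVM instruction,
    // so the generated init block is empty apart from its terminator.
    %0 = llvm.mlir.constant(0 : i32) : i32
    omp.yield(%0 : i32)
  } combiner {
  ^bb0(%arg0: i32, %arg1: i32):
    %0 = llvm.add %arg0, %arg1 : i32
    omp.yield(%0 : i32)
  }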
This fixes a bug in #92430 (but the affected code couldn't compile
before #92430 anyway).
Added:
mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
Modified:
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index cbfc64972f38b..7793d5da952ef 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -388,8 +388,19 @@ static LogicalResult inlineConvertOmpRegions(
// be processed multiple times.
moduleTranslation.forgetMapping(region);
- if (potentialTerminator && potentialTerminator->isTerminator())
- potentialTerminator->insertAfter(&builder.GetInsertBlock()->back());
+ if (potentialTerminator && potentialTerminator->isTerminator()) {
+ llvm::BasicBlock *block = builder.GetInsertBlock();
+ if (block->empty()) {
+ // this can happen for really simple reduction init regions e.g.
+ // %0 = llvm.mlir.constant(0 : i32) : i32
+ // omp.yield(%0 : i32)
+ // because the llvm.mlir.constant (MLIR op) isn't converted into any
+ // llvm op
+ potentialTerminator->insertInto(block, block->begin());
+ } else {
+ potentialTerminator->insertAfter(&block->back());
+ }
+ }
return success();
}
@@ -1171,6 +1182,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
}
}
+ builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
+
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
@@ -1183,7 +1196,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
assert(phis.size() == 1 &&
"expected one value to be yielded from the "
"reduction neutral element declaration region");
- builder.SetInsertPoint(initBlock->getTerminator());
+
+ // mapInitializationArg finishes its block with a terminator. We need to
+ // insert before that terminator.
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
if (isByRef[i]) {
// Store the result of the inlined region to the allocated reduction var
diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
new file mode 100644
index 0000000000000..f4d599538ac4a
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// This is basically a test that we don't crash while translating this IR
+
+omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init {
+^bb0(%arg0: !llvm.ptr):
+ %7 = llvm.mlir.constant(0 : i64) : i64
+ %16 = llvm.ptrtoint %arg0 : !llvm.ptr to i64
+ %17 = llvm.icmp "eq" %16, %7 : i64
+ llvm.cond_br %17, ^bb1, ^bb2
+^bb1: // pred: ^bb0
+ llvm.br ^bb3
+^bb2: // pred: ^bb0
+ llvm.br ^bb3
+^bb3: // 2 preds: ^bb1, ^bb2
+ omp.yield(%arg0 : !llvm.ptr)
+} combiner {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+} cleanup {
+^bb0(%arg0: !llvm.ptr):
+ omp.yield
+}
+llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} {
+ omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK: %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: br label %[[VAL_1:.*]]
+// CHECK: entry: ; preds = %[[VAL_2:.*]]
+// CHECK: %[[VAL_3:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: br label %[[VAL_4:.*]]
+// CHECK: omp_parallel: ; preds = %[[VAL_1]]
+// CHECK: %[[VAL_5:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 0
+// CHECK: store ptr %[[VAL_6:.*]], ptr %[[VAL_5]], align 8
+// CHECK: %[[VAL_7:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 1
+// CHECK: store ptr %[[VAL_8:.*]], ptr %[[VAL_7]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @missordered_blocks_..omp_par, ptr %[[VAL_0]])
+// CHECK: br label %[[VAL_9:.*]]
+// CHECK: omp.par.outlined.exit: ; preds = %[[VAL_4]]
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp.par.exit.split: ; preds = %[[VAL_9]]
+// CHECK: ret void
+// CHECK: omp.par.entry:
+// CHECK: %[[VAL_11:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12:.*]], i32 0, i32 0
+// CHECK: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8
+// CHECK: %[[VAL_14:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12]], i32 0, i32 1
+// CHECK: %[[VAL_15:.*]] = load ptr, ptr %[[VAL_14]], align 8
+// CHECK: %[[VAL_16:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4
+// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4
+// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4
+// CHECK: %[[VAL_20:.*]] = alloca ptr, align 8
+// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8
+// CHECK: %[[VAL_22:.*]] = alloca [2 x ptr], align 8
+// CHECK: br label %[[VAL_23:.*]]
+// CHECK: omp.reduction.init: ; preds = %[[VAL_24:.*]]
+// CHECK: br label %[[VAL_25:.*]]
+// CHECK: omp.reduction.neutral: ; preds = %[[VAL_23]]
+// CHECK: %[[VAL_26:.*]] = ptrtoint ptr %[[VAL_13]] to i64
+// CHECK: %[[VAL_27:.*]] = icmp eq i64 %[[VAL_26]], 0
+// CHECK: br i1 %[[VAL_27]], label %[[VAL_28:.*]], label %[[VAL_29:.*]]
+// CHECK: omp.reduction.neutral2: ; preds = %[[VAL_25]]
+// CHECK: br label %[[VAL_30:.*]]
+// CHECK: omp.reduction.neutral3: ; preds = %[[VAL_28]], %[[VAL_29]]
+// CHECK: br label %[[VAL_31:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_30]]
+// CHECK: %[[VAL_32:.*]] = phi ptr [ %[[VAL_13]], %[[VAL_30]] ]
+// CHECK: store ptr %[[VAL_32]], ptr %[[VAL_20]], align 8
+// CHECK: br label %[[VAL_33:.*]]
+// CHECK: omp.reduction.neutral5: ; preds = %[[VAL_31]]
+// CHECK: %[[VAL_34:.*]] = ptrtoint ptr %[[VAL_15]] to i64
+// CHECK: %[[VAL_35:.*]] = icmp eq i64 %[[VAL_34]], 0
+// CHECK: br i1 %[[VAL_35]], label %[[VAL_36:.*]], label %[[VAL_37:.*]]
+// CHECK: omp.reduction.neutral7: ; preds = %[[VAL_33]]
+// CHECK: br label %[[VAL_38:.*]]
+// CHECK: omp.reduction.neutral8: ; preds = %[[VAL_36]], %[[VAL_37]]
+// CHECK: br label %[[VAL_39:.*]]
+// CHECK: omp.region.cont4: ; preds = %[[VAL_38]]
+// CHECK: %[[VAL_40:.*]] = phi ptr [ %[[VAL_15]], %[[VAL_38]] ]
+// CHECK: store ptr %[[VAL_40]], ptr %[[VAL_21]], align 8
+// CHECK: br label %[[VAL_41:.*]]
+// CHECK: omp.par.region: ; preds = %[[VAL_39]]
+// CHECK: br label %[[VAL_42:.*]]
+// CHECK: omp.par.region10: ; preds = %[[VAL_41]]
+// CHECK: br label %[[VAL_43:.*]]
+// CHECK: omp.region.cont9: ; preds = %[[VAL_42]]
+// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 0
+// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_44]], align 8
+// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 1
+// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_45]], align 8
+// CHECK: %[[VAL_46:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: %[[VAL_47:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_46]], i32 2, i64 16, ptr %[[VAL_22]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK: switch i32 %[[VAL_47]], label %[[VAL_48:.*]] [
+// CHECK: i32 1, label %[[VAL_49:.*]]
+// CHECK: i32 2, label %[[VAL_50:.*]]
+// CHECK: ]
+// CHECK: reduce.switch.atomic: ; preds = %[[VAL_43]]
+// CHECK: unreachable
+// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_43]]
+// CHECK: %[[VAL_51:.*]] = load ptr, ptr %[[VAL_20]], align 8
+// CHECK: %[[VAL_52:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_46]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK: br label %[[VAL_48]]
+// CHECK: reduce.finalize: ; preds = %[[VAL_49]], %[[VAL_43]]
+// CHECK: br label %[[VAL_53:.*]]
+// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_48]]
+// CHECK: %[[VAL_54:.*]] = load ptr, ptr %[[VAL_20]], align 8
+// CHECK: %[[VAL_55:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK: br label %[[VAL_56:.*]]
+// CHECK: omp.reduction.neutral6: ; preds = %[[VAL_33]]
+// CHECK: br label %[[VAL_38]]
+// CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]]
+// CHECK: br label %[[VAL_30]]
+// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_53]]
+// CHECK: ret void
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
index 361905f7cddeb..0f757de39a006 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
@@ -61,10 +61,10 @@ module {
// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4
// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8
// CHECK: %[[VAL_23:.*]] = alloca ptr, align 8
-// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8
// CHECK: %[[VAL_24:.*]] = alloca [2 x ptr], align 8
// CHECK: br label %[[INIT_LABEL:.*]]
// CHECK: [[INIT_LABEL]]:
+// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8
// CHECK: store ptr %[[VAL_13]], ptr %[[VAL_21]], align 8
// CHECK: %[[VAL_22:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_15]], align 8
// CHECK: store ptr %[[VAL_15]], ptr %[[VAL_23]], align 8