[Mlir-commits] [mlir] [mlir][LLVMIR][OpenMP] fix dominance for reduction init block (PR #96052)

Tom Eccles llvmlistbot at llvm.org
Wed Jun 19 06:14:02 PDT 2024


https://github.com/tblah updated https://github.com/llvm/llvm-project/pull/96052

>From 51f371377ef0a257b36bd54fdfe340d9ad33968e Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Thu, 13 Jun 2024 14:11:13 +0000
Subject: [PATCH 1/3] [mlir][LLVMIR][OpenMP] fix dominance for reduction init
 block

It was incorrect to set the insertion point to the init block after
inlining the initialization region because the code generated in the
init block depends upon the value yielded from the init region. When
there were multiple reduction initialization regions each with multiple
blocks, this could lead to the initialization region being inlined after
the init block which depends upon it.

Moving the insertion point to before inlining the initialization block
turned up further issues around the handling of the terminator for the
initialization block, which are also fixed here.

This fixes a bug in #92430 (but the affected code couldn't compile
before #92430 anyway).
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  21 +-
 .../openmp-parallel-reduction-multiblock.mlir | 342 ++++++++++++++++++
 .../LLVMIR/openmp-reduction-init-arg.mlir     |   2 +-
 3 files changed, 361 insertions(+), 4 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index cbfc64972f38b..9fe63a9655be2 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -388,8 +388,18 @@ static LogicalResult inlineConvertOmpRegions(
     // be processed multiple times.
     moduleTranslation.forgetMapping(region);
 
-    if (potentialTerminator && potentialTerminator->isTerminator())
-      potentialTerminator->insertAfter(&builder.GetInsertBlock()->back());
+    if (potentialTerminator && potentialTerminator->isTerminator()) {
+      llvm::BasicBlock *block = builder.GetInsertBlock();
+      if (block->empty())
+        // this can happen for really simple reduction init regions e.g.
+        // %0 = llvm.mlir.constant(0 : i32) : i32
+        // omp.yield(%0 : i32)
+        // because the llvm.mlir.constant (MLIR op) isn't converted into any
+        // llvm op
+        potentialTerminator->insertInto(block, block->begin());
+      else
+        potentialTerminator->insertAfter(&block->back());
+    }
 
     return success();
   }
@@ -1171,6 +1181,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       }
     }
 
+    builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
+
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
 
@@ -1183,7 +1195,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       assert(phis.size() == 1 &&
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
-      builder.SetInsertPoint(initBlock->getTerminator());
+
+      // mapInitializationArg finishes its block with a terminator. We need to
+      // insert before that terminator.
+      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
 
       if (isByRef[i]) {
         // Store the result of the inlined region to the allocated reduction var
diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
new file mode 100644
index 0000000000000..00020bd4c9d1e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
@@ -0,0 +1,342 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// generated by flang-new:
+// subroutine missordered_blocks(x,y)
+//   integer, allocatable :: x, y
+//   !$omp parallel reduction(+:x,y)
+//   x = 42
+//   y = 24
+//   !$omp end parallel
+// end subroutine
+
+// This is basically a test that we don't crash while translating this IR
+
+omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i32) : i32
+  %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %4 = llvm.mlir.constant(1 : i32) : i32
+  %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %6 = llvm.mlir.constant(0 : i64) : i64
+  %7 = llvm.mlir.constant(0 : i32) : i32
+  %8 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %8, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %9 = llvm.mlir.constant(1 : i64) : i64
+  %10 = llvm.alloca %9 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr
+  %11 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %12 = llvm.load %11 : !llvm.ptr -> !llvm.ptr
+  %13 = llvm.ptrtoint %12 : !llvm.ptr to i64
+  %14 = llvm.icmp "eq" %13, %6 : i64
+  llvm.cond_br %14, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %15 = llvm.mlir.constant(9 : i32) : i32
+  %16 = llvm.mlir.zero : !llvm.ptr
+  %17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, i32
+  %18 = llvm.ptrtoint %17 : !llvm.ptr to i64
+  %19 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %20 = llvm.insertvalue %18, %19[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %21 = llvm.mlir.constant(20180515 : i32) : i32
+  %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %23 = llvm.mlir.constant(0 : i32) : i32
+  %24 = llvm.trunc %23 : i32 to i8
+  %25 = llvm.insertvalue %24, %22[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %26 = llvm.trunc %15 : i32 to i8
+  %27 = llvm.insertvalue %26, %25[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %28 = llvm.mlir.constant(2 : i32) : i32
+  %29 = llvm.trunc %28 : i32 to i8
+  %30 = llvm.insertvalue %29, %27[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %31 = llvm.mlir.constant(0 : i32) : i32
+  %32 = llvm.trunc %31 : i32 to i8
+  %33 = llvm.insertvalue %32, %30[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %34 = llvm.insertvalue %12, %33[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  llvm.store %34, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %35 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %35, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  llvm.br ^bb3
+^bb2:  // pred: ^bb0
+  %36 = llvm.mlir.zero : !llvm.ptr
+  %37 = llvm.getelementptr %36[1] : (!llvm.ptr) -> !llvm.ptr, i32
+  %38 = llvm.ptrtoint %37 : !llvm.ptr to i64
+  //%39 = llvm.call @malloc(%38) {in_type = i32, operandSegmentSizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr
+  %39 = llvm.mlir.zero : !llvm.ptr
+  llvm.store %7, %39 : i32, !llvm.ptr
+  %40 = llvm.mlir.constant(9 : i32) : i32
+  %41 = llvm.mlir.zero : !llvm.ptr
+  %42 = llvm.getelementptr %41[1] : (!llvm.ptr) -> !llvm.ptr, i32
+  %43 = llvm.ptrtoint %42 : !llvm.ptr to i64
+  %44 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %45 = llvm.insertvalue %43, %44[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %46 = llvm.mlir.constant(20180515 : i32) : i32
+  %47 = llvm.insertvalue %46, %45[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %48 = llvm.mlir.constant(0 : i32) : i32
+  %49 = llvm.trunc %48 : i32 to i8
+  %50 = llvm.insertvalue %49, %47[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %51 = llvm.trunc %40 : i32 to i8
+  %52 = llvm.insertvalue %51, %50[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %53 = llvm.mlir.constant(2 : i32) : i32
+  %54 = llvm.trunc %53 : i32 to i8
+  %55 = llvm.insertvalue %54, %52[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %56 = llvm.mlir.constant(0 : i32) : i32
+  %57 = llvm.trunc %56 : i32 to i8
+  %58 = llvm.insertvalue %57, %55[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %59 = llvm.insertvalue %39, %58[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  llvm.store %59, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %60 = llvm.load %1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %60, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  llvm.br ^bb3
+^bb3:  // 2 preds: ^bb1, ^bb2
+  omp.yield(%10 : !llvm.ptr)
+} combiner {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i32) : i32
+  %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %4 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %4, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %5, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %6 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
+  %8 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
+  %10 = llvm.load %7 : !llvm.ptr -> i32
+  %11 = llvm.load %9 : !llvm.ptr -> i32
+  %12 = llvm.add %10, %11 : i32
+  llvm.store %12, %7 : i32, !llvm.ptr
+  omp.yield(%arg0 : !llvm.ptr)
+}  cleanup {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %2 = llvm.mlir.constant(0 : i64) : i64
+  %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %3, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %5 = llvm.load %4 : !llvm.ptr -> !llvm.ptr
+  %6 = llvm.ptrtoint %5 : !llvm.ptr to i64
+  %7 = llvm.icmp "ne" %6, %2 : i64
+  llvm.cond_br %7, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  //llvm.call @free(%5) : (!llvm.ptr) -> ()
+  llvm.br ^bb2
+^bb2:  // 2 preds: ^bb0, ^bb1
+  omp.yield
+}
+llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} {
+  %0 = llvm.mlir.constant(24 : i32) : i32
+  %1 = llvm.mlir.constant(42 : i32) : i32
+  omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) {
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %4 = llvm.mlir.constant(1 : i32) : i32
+    %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %6 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    llvm.store %6, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+    %7 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr
+    llvm.store %1, %8 : i32, !llvm.ptr
+    %9 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    llvm.store %9, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+    %10 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    llvm.store %0, %11 : i32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+
+// CHECK:         %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8
+// CHECK:         br label %[[VAL_1:.*]]
+// CHECK:       entry:                                            ; preds = %[[VAL_2:.*]]
+// CHECK:         %[[VAL_3:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         br label %[[VAL_4:.*]]
+// CHECK:       omp_parallel:                                     ; preds = %[[VAL_1]]
+// CHECK:         %[[VAL_5:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 0
+// CHECK:         store ptr %[[VAL_6:.*]], ptr %[[VAL_5]], align 8
+// CHECK:         %[[VAL_7:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 1
+// CHECK:         store ptr %[[VAL_8:.*]], ptr %[[VAL_7]], align 8
+// CHECK:         call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @missordered_blocks_..omp_par, ptr %[[VAL_0]])
+// CHECK:         br label %[[VAL_9:.*]]
+// CHECK:       omp.par.outlined.exit:                            ; preds = %[[VAL_4]]
+// CHECK:         br label %[[VAL_10:.*]]
+// CHECK:       omp.par.exit.split:                               ; preds = %[[VAL_9]]
+// CHECK:         ret void
+// CHECK:       omp.par.entry:
+// CHECK:         %[[VAL_11:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12:.*]], i32 0, i32 0
+// CHECK:         %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8
+// CHECK:         %[[VAL_14:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12]], i32 0, i32 1
+// CHECK:         %[[VAL_15:.*]] = load ptr, ptr %[[VAL_14]], align 8
+// CHECK:         %[[VAL_16:.*]] = alloca i32, align 4
+// CHECK:         %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4
+// CHECK:         store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4
+// CHECK:         %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4
+// CHECK:         %[[VAL_20:.*]] = alloca ptr, align 8
+// CHECK:         %[[VAL_21:.*]] = alloca ptr, align 8
+// CHECK:         %[[VAL_22:.*]] = alloca [2 x ptr], align 8
+// CHECK:         br label %[[VAL_23:.*]]
+// CHECK:       omp.reduction.init:                               ; preds = %[[VAL_24:.*]]
+// CHECK:         br label %[[VAL_25:.*]]
+// CHECK:       omp.reduction.neutral:                            ; preds = %[[VAL_23]]
+// CHECK:         %[[VAL_26:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_27:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_28:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_29:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_29]], ptr %[[VAL_28]], align 8
+// CHECK:         %[[VAL_30:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
+// CHECK:         %[[VAL_31:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_28]], i32 0, i32 0
+// CHECK:         %[[VAL_32:.*]] = load ptr, ptr %[[VAL_31]], align 8
+// CHECK:         %[[VAL_33:.*]] = ptrtoint ptr %[[VAL_32]] to i64
+// CHECK:         %[[VAL_34:.*]] = icmp eq i64 %[[VAL_33]], 0
+// CHECK:         br i1 %[[VAL_34]], label %[[VAL_35:.*]], label %[[VAL_36:.*]]
+// CHECK:       omp.reduction.neutral2:                           ; preds = %[[VAL_25]]
+// CHECK:         store i32 0, ptr null, align 4
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_26]], align 8
+// CHECK:         %[[VAL_37:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_26]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_37]], ptr %[[VAL_30]], align 8
+// CHECK:         br label %[[VAL_38:.*]]
+// CHECK:       omp.reduction.neutral3:                           ; preds = %[[VAL_35]], %[[VAL_36]]
+// CHECK:         br label %[[VAL_39:.*]]
+// CHECK:       omp.region.cont:                                  ; preds = %[[VAL_38]]
+// CHECK:         %[[VAL_40:.*]] = phi ptr [ %[[VAL_30]], %[[VAL_38]] ]
+// CHECK:         store ptr %[[VAL_40]], ptr %[[VAL_20]], align 8
+// CHECK:         br label %[[VAL_41:.*]]
+// CHECK:       omp.reduction.neutral5:                           ; preds = %[[VAL_39]]
+// CHECK:         %[[VAL_42:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_43:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_44:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_45:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_45]], ptr %[[VAL_44]], align 8
+// CHECK:         %[[VAL_46:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
+// CHECK:         %[[VAL_47:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_44]], i32 0, i32 0
+// CHECK:         %[[VAL_48:.*]] = load ptr, ptr %[[VAL_47]], align 8
+// CHECK:         %[[VAL_49:.*]] = ptrtoint ptr %[[VAL_48]] to i64
+// CHECK:         %[[VAL_50:.*]] = icmp eq i64 %[[VAL_49]], 0
+// CHECK:         br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]]
+// CHECK:       omp.reduction.neutral7:                           ; preds = %[[VAL_41]]
+// CHECK:         store i32 0, ptr null, align 4
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_42]], align 8
+// CHECK:         %[[VAL_53:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_42]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_53]], ptr %[[VAL_46]], align 8
+// CHECK:         br label %[[VAL_54:.*]]
+// CHECK:       omp.reduction.neutral8:                           ; preds = %[[VAL_51]], %[[VAL_52]]
+// CHECK:         br label %[[VAL_55:.*]]
+// CHECK:       omp.region.cont4:                                 ; preds = %[[VAL_54]]
+// CHECK:         %[[VAL_56:.*]] = phi ptr [ %[[VAL_46]], %[[VAL_54]] ]
+// CHECK:         store ptr %[[VAL_56]], ptr %[[VAL_21]], align 8
+// CHECK:         br label %[[VAL_57:.*]]
+// CHECK:       omp.par.region:                                   ; preds = %[[VAL_55]]
+// CHECK:         br label %[[VAL_58:.*]]
+// CHECK:       omp.par.region10:                                 ; preds = %[[VAL_57]]
+// CHECK:         %[[VAL_59:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_60:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_61:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_40]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_61]], ptr %[[VAL_60]], align 8
+// CHECK:         %[[VAL_62:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_60]], i32 0, i32 0
+// CHECK:         %[[VAL_63:.*]] = load ptr, ptr %[[VAL_62]], align 8
+// CHECK:         store i32 42, ptr %[[VAL_63]], align 4
+// CHECK:         %[[VAL_64:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_56]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_64]], ptr %[[VAL_59]], align 8
+// CHECK:         %[[VAL_65:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_59]], i32 0, i32 0
+// CHECK:         %[[VAL_66:.*]] = load ptr, ptr %[[VAL_65]], align 8
+// CHECK:         store i32 24, ptr %[[VAL_66]], align 4
+// CHECK:         br label %[[VAL_67:.*]]
+// CHECK:       omp.region.cont9:                                 ; preds = %[[VAL_58]]
+// CHECK:         %[[VAL_68:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 0
+// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_68]], align 8
+// CHECK:         %[[VAL_69:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 1
+// CHECK:         store ptr %[[VAL_21]], ptr %[[VAL_69]], align 8
+// CHECK:         %[[VAL_70:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         %[[VAL_71:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_70]], i32 2, i64 16, ptr %[[VAL_22]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK:         switch i32 %[[VAL_71]], label %[[VAL_72:.*]] [
+// CHECK:           i32 1, label %[[VAL_73:.*]]
+// CHECK:           i32 2, label %[[VAL_74:.*]]
+// CHECK:         ]
+// CHECK:       reduce.switch.atomic:                             ; preds = %[[VAL_67]]
+// CHECK:         unreachable
+// CHECK:       reduce.switch.nonatomic:                          ; preds = %[[VAL_67]]
+// CHECK:         %[[VAL_75:.*]] = load ptr, ptr %[[VAL_20]], align 8
+// CHECK:         %[[VAL_76:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_77:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_78:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_78]], ptr %[[VAL_77]], align 8
+// CHECK:         %[[VAL_79:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_75]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_79]], ptr %[[VAL_76]], align 8
+// CHECK:         %[[VAL_80:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_77]], i32 0, i32 0
+// CHECK:         %[[VAL_81:.*]] = load ptr, ptr %[[VAL_80]], align 8
+// CHECK:         %[[VAL_82:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_76]], i32 0, i32 0
+// CHECK:         %[[VAL_83:.*]] = load ptr, ptr %[[VAL_82]], align 8
+// CHECK:         %[[VAL_84:.*]] = load i32, ptr %[[VAL_81]], align 4
+// CHECK:         %[[VAL_85:.*]] = load i32, ptr %[[VAL_83]], align 4
+// CHECK:         %[[VAL_86:.*]] = add i32 %[[VAL_84]], %[[VAL_85]]
+// CHECK:         store i32 %[[VAL_86]], ptr %[[VAL_81]], align 4
+// CHECK:         %[[VAL_87:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK:         %[[VAL_88:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_89:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_90:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_90]], ptr %[[VAL_89]], align 8
+// CHECK:         %[[VAL_91:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_87]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_91]], ptr %[[VAL_88]], align 8
+// CHECK:         %[[VAL_92:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_89]], i32 0, i32 0
+// CHECK:         %[[VAL_93:.*]] = load ptr, ptr %[[VAL_92]], align 8
+// CHECK:         %[[VAL_94:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_88]], i32 0, i32 0
+// CHECK:         %[[VAL_95:.*]] = load ptr, ptr %[[VAL_94]], align 8
+// CHECK:         %[[VAL_96:.*]] = load i32, ptr %[[VAL_93]], align 4
+// CHECK:         %[[VAL_97:.*]] = load i32, ptr %[[VAL_95]], align 4
+// CHECK:         %[[VAL_98:.*]] = add i32 %[[VAL_96]], %[[VAL_97]]
+// CHECK:         store i32 %[[VAL_98]], ptr %[[VAL_93]], align 4
+// CHECK:         call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_70]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK:         br label %[[VAL_72]]
+// CHECK:       reduce.finalize:                                  ; preds = %[[VAL_73]], %[[VAL_67]]
+// CHECK:         br label %[[VAL_99:.*]]
+// CHECK:       omp.par.pre_finalize:                             ; preds = %[[VAL_72]]
+// CHECK:         %[[VAL_100:.*]] = load ptr, ptr %[[VAL_20]], align 8
+// CHECK:         br label %[[VAL_101:.*]]
+// CHECK:       omp.reduction.cleanup:                            ; preds = %[[VAL_99]]
+// CHECK:         %[[VAL_102:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_103:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_100]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_103]], ptr %[[VAL_102]], align 8
+// CHECK:         %[[VAL_104:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_102]], i32 0, i32 0
+// CHECK:         %[[VAL_105:.*]] = load ptr, ptr %[[VAL_104]], align 8
+// CHECK:         %[[VAL_106:.*]] = ptrtoint ptr %[[VAL_105]] to i64
+// CHECK:         %[[VAL_107:.*]] = icmp ne i64 %[[VAL_106]], 0
+// CHECK:         br i1 %[[VAL_107]], label %[[VAL_108:.*]], label %[[VAL_109:.*]]
+// CHECK:       omp.reduction.cleanup14:                          ; preds = %[[VAL_108]], %[[VAL_101]]
+// CHECK:         br label %[[VAL_110:.*]]
+// CHECK:       omp.region.cont12:                                ; preds = %[[VAL_109]]
+// CHECK:         %[[VAL_111:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK:         br label %[[VAL_112:.*]]
+// CHECK:       omp.reduction.cleanup16:                          ; preds = %[[VAL_110]]
+// CHECK:         %[[VAL_113:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         %[[VAL_114:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_111]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_114]], ptr %[[VAL_113]], align 8
+// CHECK:         %[[VAL_115:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_113]], i32 0, i32 0
+// CHECK:         %[[VAL_116:.*]] = load ptr, ptr %[[VAL_115]], align 8
+// CHECK:         %[[VAL_117:.*]] = ptrtoint ptr %[[VAL_116]] to i64
+// CHECK:         %[[VAL_118:.*]] = icmp ne i64 %[[VAL_117]], 0
+// CHECK:         br i1 %[[VAL_118]], label %[[VAL_119:.*]], label %[[VAL_120:.*]]
+// CHECK:       omp.reduction.cleanup18:                          ; preds = %[[VAL_119]], %[[VAL_112]]
+// CHECK:         br label %[[VAL_121:.*]]
+// CHECK:       omp.region.cont15:                                ; preds = %[[VAL_120]]
+// CHECK:         br label %[[VAL_122:.*]]
+// CHECK:       omp.reduction.cleanup17:                          ; preds = %[[VAL_112]]
+// CHECK:         br label %[[VAL_120]]
+// CHECK:       omp.reduction.cleanup13:                          ; preds = %[[VAL_101]]
+// CHECK:         br label %[[VAL_109]]
+// CHECK:       omp.reduction.neutral6:                           ; preds = %[[VAL_41]]
+// CHECK:         %[[VAL_123:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_48]], 0
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_123]], ptr %[[VAL_43]], align 8
+// CHECK:         %[[VAL_124:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_43]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_124]], ptr %[[VAL_46]], align 8
+// CHECK:         br label %[[VAL_54]]
+// CHECK:       omp.reduction.neutral1:                           ; preds = %[[VAL_25]]
+// CHECK:         %[[VAL_125:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_32]], 0
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_125]], ptr %[[VAL_27]], align 8
+// CHECK:         %[[VAL_126:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_27]], align 8
+// CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_126]], ptr %[[VAL_30]], align 8
+// CHECK:         br label %[[VAL_38]]
+// CHECK:       omp.par.outlined.exit.exitStub:                   ; preds = %[[VAL_121]]
+// CHECK:         ret void
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
index 361905f7cddeb..0f757de39a006 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
@@ -61,10 +61,10 @@ module {
 // CHECK:         %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4
 // CHECK:         %[[VAL_21:.*]] = alloca ptr, align 8
 // CHECK:         %[[VAL_23:.*]] = alloca ptr, align 8
-// CHECK:         %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8
 // CHECK:         %[[VAL_24:.*]] = alloca [2 x ptr], align 8
 // CHECK:         br label %[[INIT_LABEL:.*]]
 // CHECK: [[INIT_LABEL]]:
+// CHECK:         %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8
 // CHECK:         store ptr %[[VAL_13]], ptr %[[VAL_21]], align 8
 // CHECK:         %[[VAL_22:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_15]], align 8
 // CHECK:         store ptr %[[VAL_15]], ptr %[[VAL_23]], align 8

>From 5b39edff5d21242e3af64d863ac68cdd6b458f29 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 19 Jun 2024 13:00:44 +0000
Subject: [PATCH 2/3] Add braces

---
 .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 9fe63a9655be2..7793d5da952ef 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -390,15 +390,16 @@ static LogicalResult inlineConvertOmpRegions(
 
     if (potentialTerminator && potentialTerminator->isTerminator()) {
       llvm::BasicBlock *block = builder.GetInsertBlock();
-      if (block->empty())
+      if (block->empty()) {
         // this can happen for really simple reduction init regions e.g.
         // %0 = llvm.mlir.constant(0 : i32) : i32
         // omp.yield(%0 : i32)
         // because the llvm.mlir.constant (MLIR op) isn't converted into any
         // llvm op
         potentialTerminator->insertInto(block, block->begin());
-      else
+      } else {
         potentialTerminator->insertAfter(&block->back());
+      }
     }
 
     return success();

>From 2be9ad26b63872eea8610f575ec13b4956be04af Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 19 Jun 2024 13:13:37 +0000
Subject: [PATCH 3/3] canonicalize and cse test

---
 .../openmp-parallel-reduction-multiblock.mlir | 194 ++++++++----------
 1 file changed, 86 insertions(+), 108 deletions(-)

diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
index 00020bd4c9d1e..4952b15287f81 100644
--- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir
@@ -13,142 +13,120 @@
 
 omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init {
 ^bb0(%arg0: !llvm.ptr):
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %2 = llvm.mlir.constant(1 : i32) : i32
-  %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %4 = llvm.mlir.constant(1 : i32) : i32
-  %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %6 = llvm.mlir.constant(0 : i64) : i64
-  %7 = llvm.mlir.constant(0 : i32) : i32
-  %8 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %8, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %9 = llvm.mlir.constant(1 : i64) : i64
-  %10 = llvm.alloca %9 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr
-  %11 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  %12 = llvm.load %11 : !llvm.ptr -> !llvm.ptr
-  %13 = llvm.ptrtoint %12 : !llvm.ptr to i64
-  %14 = llvm.icmp "eq" %13, %6 : i64
-  llvm.cond_br %14, ^bb1, ^bb2
+  %0 = llvm.mlir.constant(2 : i32) : i32
+  %1 = llvm.mlir.constant(20180515 : i32) : i32
+  %2 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %3 = llvm.mlir.zero : !llvm.ptr
+  %4 = llvm.mlir.constant(9 : i32) : i32
+  %5 = llvm.mlir.constant(1 : i64) : i64
+  %6 = llvm.mlir.constant(0 : i32) : i32
+  %7 = llvm.mlir.constant(0 : i64) : i64
+  %8 = llvm.mlir.constant(1 : i32) : i32
+  %9 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %10 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %11 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %12 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %12, %11 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %13 = llvm.alloca %5 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr
+  %14 = llvm.getelementptr %11[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr
+  %16 = llvm.ptrtoint %15 : !llvm.ptr to i64
+  %17 = llvm.icmp "eq" %16, %7 : i64
+  llvm.cond_br %17, ^bb1, ^bb2
 ^bb1:  // pred: ^bb0
-  %15 = llvm.mlir.constant(9 : i32) : i32
-  %16 = llvm.mlir.zero : !llvm.ptr
-  %17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, i32
-  %18 = llvm.ptrtoint %17 : !llvm.ptr to i64
-  %19 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  %20 = llvm.insertvalue %18, %19[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %21 = llvm.mlir.constant(20180515 : i32) : i32
-  %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %23 = llvm.mlir.constant(0 : i32) : i32
-  %24 = llvm.trunc %23 : i32 to i8
-  %25 = llvm.insertvalue %24, %22[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %26 = llvm.trunc %15 : i32 to i8
-  %27 = llvm.insertvalue %26, %25[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %28 = llvm.mlir.constant(2 : i32) : i32
-  %29 = llvm.trunc %28 : i32 to i8
-  %30 = llvm.insertvalue %29, %27[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %31 = llvm.mlir.constant(0 : i32) : i32
-  %32 = llvm.trunc %31 : i32 to i8
-  %33 = llvm.insertvalue %32, %30[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %34 = llvm.insertvalue %12, %33[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  llvm.store %34, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %35 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %35, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %18 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32
+  %19 = llvm.ptrtoint %18 : !llvm.ptr to i64
+  %20 = llvm.insertvalue %19, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %21 = llvm.insertvalue %1, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %22 = llvm.trunc %6 : i32 to i8
+  %23 = llvm.insertvalue %22, %21[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %24 = llvm.trunc %4 : i32 to i8
+  %25 = llvm.insertvalue %24, %23[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %26 = llvm.trunc %0 : i32 to i8
+  %27 = llvm.insertvalue %26, %25[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %28 = llvm.insertvalue %22, %27[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %29 = llvm.insertvalue %15, %28[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  llvm.store %29, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %30 = llvm.load %10 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %30, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
   llvm.br ^bb3
 ^bb2:  // pred: ^bb0
-  %36 = llvm.mlir.zero : !llvm.ptr
-  %37 = llvm.getelementptr %36[1] : (!llvm.ptr) -> !llvm.ptr, i32
-  %38 = llvm.ptrtoint %37 : !llvm.ptr to i64
-  //%39 = llvm.call @malloc(%38) {in_type = i32, operandSegmentSizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr
-  %39 = llvm.mlir.zero : !llvm.ptr
-  llvm.store %7, %39 : i32, !llvm.ptr
-  %40 = llvm.mlir.constant(9 : i32) : i32
-  %41 = llvm.mlir.zero : !llvm.ptr
-  %42 = llvm.getelementptr %41[1] : (!llvm.ptr) -> !llvm.ptr, i32
-  %43 = llvm.ptrtoint %42 : !llvm.ptr to i64
-  %44 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  %45 = llvm.insertvalue %43, %44[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %46 = llvm.mlir.constant(20180515 : i32) : i32
-  %47 = llvm.insertvalue %46, %45[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %48 = llvm.mlir.constant(0 : i32) : i32
-  %49 = llvm.trunc %48 : i32 to i8
-  %50 = llvm.insertvalue %49, %47[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %51 = llvm.trunc %40 : i32 to i8
-  %52 = llvm.insertvalue %51, %50[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %53 = llvm.mlir.constant(2 : i32) : i32
-  %54 = llvm.trunc %53 : i32 to i8
-  %55 = llvm.insertvalue %54, %52[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %56 = llvm.mlir.constant(0 : i32) : i32
-  %57 = llvm.trunc %56 : i32 to i8
-  %58 = llvm.insertvalue %57, %55[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  %59 = llvm.insertvalue %39, %58[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
-  llvm.store %59, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %60 = llvm.load %1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %60, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %31 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %32 = llvm.ptrtoint %31 : !llvm.ptr to i64
+  %33 = llvm.insertvalue %32, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %34 = llvm.insertvalue %1, %33[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %35 = llvm.trunc %6 : i32 to i8
+  %36 = llvm.insertvalue %35, %34[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %37 = llvm.trunc %4 : i32 to i8
+  %38 = llvm.insertvalue %37, %36[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %39 = llvm.trunc %0 : i32 to i8
+  %40 = llvm.insertvalue %39, %38[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %41 = llvm.insertvalue %35, %40[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  %42 = llvm.insertvalue %3, %41[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> 
+  llvm.store %42, %9 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %43 = llvm.load %9 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %43, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
   llvm.br ^bb3
 ^bb3:  // 2 preds: ^bb1, ^bb2
-  omp.yield(%10 : !llvm.ptr)
+  omp.yield(%13 : !llvm.ptr)
 } combiner {
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %2 = llvm.mlir.constant(1 : i32) : i32
-  %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %4 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %4, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %5, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %6 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
-  %8 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
-  %10 = llvm.load %7 : !llvm.ptr -> i32
-  %11 = llvm.load %9 : !llvm.ptr -> i32
-  %12 = llvm.add %10, %11 : i32
-  llvm.store %12, %7 : i32, !llvm.ptr
+  %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %4 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %4, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %5 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %6 = llvm.load %5 : !llvm.ptr -> !llvm.ptr
+  %7 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr
+  %9 = llvm.load %6 : !llvm.ptr -> i32
+  %10 = llvm.load %8 : !llvm.ptr -> i32
+  %11 = llvm.add %9, %10 : i32
+  llvm.store %11, %6 : i32, !llvm.ptr
   omp.yield(%arg0 : !llvm.ptr)
 }  cleanup {
 ^bb0(%arg0: !llvm.ptr):
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %2 = llvm.mlir.constant(0 : i64) : i64
+  %0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.mlir.constant(1 : i32) : i32
+  %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-  llvm.store %3, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-  %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+  llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+  %4 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
   %5 = llvm.load %4 : !llvm.ptr -> !llvm.ptr
   %6 = llvm.ptrtoint %5 : !llvm.ptr to i64
-  %7 = llvm.icmp "ne" %6, %2 : i64
+  %7 = llvm.icmp "ne" %6, %0 : i64
   llvm.cond_br %7, ^bb1, ^bb2
 ^bb1:  // pred: ^bb0
-  //llvm.call @free(%5) : (!llvm.ptr) -> ()
   llvm.br ^bb2
 ^bb2:  // 2 preds: ^bb0, ^bb1
   omp.yield
 }
 llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} {
-  %0 = llvm.mlir.constant(24 : i32) : i32
-  %1 = llvm.mlir.constant(42 : i32) : i32
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %1 = llvm.mlir.constant(24 : i32) : i32
+  %2 = llvm.mlir.constant(42 : i32) : i32
   omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) {
-    %2 = llvm.mlir.constant(1 : i32) : i32
-    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-    %4 = llvm.mlir.constant(1 : i32) : i32
-    %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-    %6 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-    llvm.store %6, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-    %7 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-    %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr
-    llvm.store %1, %8 : i32, !llvm.ptr
-    %9 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-    llvm.store %9, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
-    %10 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
-    llvm.store %0, %11 : i32, !llvm.ptr
+    %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %4 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %5 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    llvm.store %5, %4 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+    %6 = llvm.getelementptr %4[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
+    llvm.store %2, %7 : i32, !llvm.ptr
+    %8 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    llvm.store %8, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+    %9 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+    llvm.store %1, %10 : i32, !llvm.ptr
     omp.terminator
   }
   llvm.return
 }
 
-
 // CHECK:         %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8
 // CHECK:         br label %[[VAL_1:.*]]
 // CHECK:       entry:                                            ; preds = %[[VAL_2:.*]]



More information about the Mlir-commits mailing list