[Mlir-commits] [flang] [mlir] [Flang][mlir] - Translation of delayed privatization for deferred target-tasks (PR #155348)

Pranav Bhandarkar llvmlistbot at llvm.org
Mon Sep 29 08:53:16 PDT 2025


https://github.com/bhandarkar-pranav updated https://github.com/llvm/llvm-project/pull/155348

>From 938091dd5d8c0392ac3874e5f2ec915428a5da08 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 17 Jun 2025 14:25:29 -0500
Subject: [PATCH 01/27] [Flang][mlir] - Translation of delayed privatization
 for deferred target-tasks

This patch adds support for translation of the private clause on deferred
target tasks - that is `omp.target` operations with the `nowait` clause.

An offloading call for a deferred target-task is not blocking - the offloading
host task continues its execution after issuing the offloading call. Therefore,
the key problem we need to solve is to ensure that the data needed for private
variables to be initialized in the target task persists even after the host
task has completed.
We do this in a new pass called PrepareForOMPOffloadPrivatizationPass. For a privatized
variable that needs its host counterpart for initialization (such as the shape
of the data from the descriptor when an allocatable is privatized or the value of
the data when an allocatable is firstprivatized),
  - the pass allocates memory on the heap.
  - it then initializes this memory by copying the contents of the host variable
    to the newly allocated location on the heap.
  - Then, the pass updates all the `omp.map.info` operations that pointed to the
    host variable to now point to the one located in the heap.

The pass uses a rewrite pattern applied using the greedy pattern matcher, which
in turn does some constant folding and DCE. Because of this, a number of lit
tests had to be updated. In GEPs, constants get folded into indices and are
truncated to i32 types. In some tests, sequences of insertvalue and extractvalue
instructions cancel out. So, these needed to be updated too.
---
 .../flang/Optimizer/Passes/Pipelines.h        |   1 +
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |   1 +
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   6 +
 flang/test/Driver/tco-emit-final-mlir.fir     |   4 +-
 flang/test/Driver/tco-test-gen.fir            |   5 +-
 flang/test/Fir/alloc-32.fir                   |   2 +-
 flang/test/Fir/alloc.fir                      |  17 +-
 flang/test/Fir/arrexp.fir                     |   4 +-
 flang/test/Fir/basic-program.fir              |   2 +
 flang/test/Fir/box.fir                        |   6 +-
 flang/test/Fir/boxproc.fir                    |  16 +-
 flang/test/Fir/embox.fir                      |   6 +-
 .../test/Fir/omp-reduction-embox-codegen.fir  |   6 +-
 flang/test/Fir/optional.fir                   |   3 +-
 flang/test/Fir/pdt.fir                        |   6 +-
 flang/test/Fir/rebox.fir                      |  18 +-
 flang/test/Fir/select.fir                     |   2 +-
 flang/test/Fir/target.fir                     |   4 -
 flang/test/Fir/tbaa-codegen2.fir              |  12 +-
 .../OpenMP/map-types-and-sizes.f90            |  14 +-
 flang/test/Lower/allocatable-polymorphic.f90  |   4 -
 flang/test/Lower/forall/character-1.f90       |   4 +-
 .../OpenMPOffloadPrivatizationPrepare.h       |  23 +
 .../mlir/Dialect/LLVMIR/Transforms/Passes.td  |  12 +
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |   4 +-
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |   3 +-
 .../Dialect/LLVMIR/Transforms/CMakeLists.txt  |   2 +
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 429 ++++++++++++++++++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  11 +-
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp       |   1 +
 .../omp-offload-privatization-prepare.mlir    | 167 +++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  18 -
 32 files changed, 705 insertions(+), 108 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
 create mode 100644 mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
 create mode 100644 mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir

diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index fd8c43cc88a19..4d4d30e69cdd7 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -22,6 +22,7 @@
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 76f3cbd421cb9..b4b4e6e7e1283 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -52,6 +52,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/LLVMIR/Transforms/AddComdats.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/BuiltinTypes.h"
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 98f947a1f635d..d9cb14262ea95 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -411,6 +411,12 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
 
   // Add codegen pass pipeline.
   fir::createDefaultFIRCodeGenPassPipeline(pm, config, inputFilename);
+
+  // Run a pass to prepare for translation of delayed privatization in the
+  // context of deferred target tasks.
+  addNestedPassConditionally<mlir::LLVM::LLVMFuncOp>(pm, disableFirToLlvmIr,[&]() {
+    return mlir::LLVM::createPrepareForOMPOffloadPrivatizationPass();
+  });
 }
 
 } // namespace fir
diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir
index 75f8f153127af..177810cf41378 100644
--- a/flang/test/Driver/tco-emit-final-mlir.fir
+++ b/flang/test/Driver/tco-emit-final-mlir.fir
@@ -13,7 +13,7 @@
 // CHECK: llvm.return
 // CHECK-NOT: func.func
 
-func.func @_QPfoo() {
+func.func @_QPfoo() -> !fir.ref<i32> {
   %1 = fir.alloca i32
-  return
+  return %1 : !fir.ref<i32>
 }
diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir
index 38d4e50ecf3aa..15483f7ee3534 100644
--- a/flang/test/Driver/tco-test-gen.fir
+++ b/flang/test/Driver/tco-test-gen.fir
@@ -42,11 +42,10 @@ func.func @_QPtest(%arg0: !fir.ref<i32> {fir.bindc_name = "num"}, %arg1: !fir.re
 // CHECK-SAME:      %[[ARG2:.*]]: !llvm.ptr {fir.bindc_name = "ub", llvm.nocapture},
 // CHECK-SAME:      %[[ARG3:.*]]: !llvm.ptr {fir.bindc_name = "step", llvm.nocapture}) {
 
+// CMPLX:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
+// CMPLX:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
 // CMPLX:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CMPLX:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
-// CMPLX:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
-// CMPLX:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
-// CMPLX:           %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64
 
 // SIMPLE:          %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
 // SIMPLE:          %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir
index a3cbf200c24fc..f57f6ce6fcf5e 100644
--- a/flang/test/Fir/alloc-32.fir
+++ b/flang/test/Fir/alloc-32.fir
@@ -19,7 +19,7 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap<i32> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]]
+// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32
diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir
index 8da8b828c18b9..0d3ce323d0d7c 100644
--- a/flang/test/Fir/alloc.fir
+++ b/flang/test/Fir/alloc.fir
@@ -86,7 +86,7 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref<!fir.char<2,?>> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]]
+// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -98,7 +98,7 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap<!fir.char<1,?>> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar_kind(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]]
+// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 2
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -185,7 +185,7 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref<!fir.array<?x?xi32
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_nonchar(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 12, %[[extent]]
+// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 12
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod1]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod1]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -196,7 +196,7 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap<!fir.array<3x?xi
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_nonchar2(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 4, %[[extent]]
+// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 4
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod2]], i64 1
@@ -227,7 +227,7 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref<!fir.array<?x?x!fir.
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_char(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 60, %[[extent]]
+// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 60
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod1]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod1]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -238,7 +238,7 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap<!fir.array<3x?x!fi
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_char2(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 20, %[[extent]]
+// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 20
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
@@ -286,7 +286,7 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap<!fir.a
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_dynchar2(
 // CHECK-SAME: i32 %[[len:.*]], i64 %[[extent:.*]])
 // CHECK: %[[a:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[prod1:.*]] = mul i64 2, %[[a]]
+// CHECK: %[[prod1:.*]] = mul i64 %[[a]], 2
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[prod3:.*]] = mul i64 %[[prod2]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod3]], 0
@@ -366,12 +366,13 @@ func.func @allocmem_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir
 // CHECK:    %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
-
+func.func private @foo(%0: !fir.ref<!fir.class<none>>, %1: !fir.ref<!fir.class<!fir.array<?xnone>>>, %2: !fir.ref<!fir.box<none>>, %3: !fir.ref<!fir.box<!fir.array<?xnone>>>)
 func.func @alloca_unlimited_polymorphic_box() {
   %0 = fir.alloca !fir.class<none>
   %1 = fir.alloca !fir.class<!fir.array<?xnone>>
   %2 = fir.alloca !fir.box<none>
   %3 = fir.alloca !fir.box<!fir.array<?xnone>>
+  fir.call @foo(%0, %1, %2, %3) : (!fir.ref<!fir.class<none>>, !fir.ref<!fir.class<!fir.array<?xnone>>>, !fir.ref<!fir.box<none>>, !fir.ref<!fir.box<!fir.array<?xnone>>>) -> ()
   return
 }
 // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not
diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir
index e8ec8ac79e0c2..2eb717228d998 100644
--- a/flang/test/Fir/arrexp.fir
+++ b/flang/test/Fir/arrexp.fir
@@ -143,9 +143,9 @@ func.func @f6(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: f32) {
   %c9 = arith.constant 9 : index
   %c10 = arith.constant 10 : index
 
-  // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1
+  // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i32 0, i32 1
   // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]]
-  // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]]
+  // CHECK: %[[SIZE:.*]] = mul i64 %[[EXTENT]], 4
   // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0
   // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1
   // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SZ]])
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index c9fe53bf093a1..6bad03dded24d 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -158,4 +158,6 @@ func.func @_QQmain() {
 // PASSES-NEXT:  LowerNontemporalPass
 // PASSES-NEXT: FIRToLLVMLowering
 // PASSES-NEXT: ReconcileUnrealizedCasts
+// PASSES-NEXT: 'llvm.func' Pipeline
+// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass
 // PASSES-NEXT: LLVMIRLoweringPass
diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir
index c0cf3d8375983..760fbd4792122 100644
--- a/flang/test/Fir/box.fir
+++ b/flang/test/Fir/box.fir
@@ -57,7 +57,7 @@ func.func @fa(%a : !fir.ref<!fir.array<100xf32>>) {
 // CHECK-SAME: ptr {{[^%]*}}%[[res:.*]], ptr {{[^%]*}}%[[arg0:.*]], i64 %[[arg1:.*]])
 func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.char<1,?>> {
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
-  // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]]
+  // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1
   // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
@@ -89,7 +89,7 @@ func.func @b2(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,5>>>, %arg1 : index) ->
 func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %arg2 : index) -> !fir.box<!fir.array<?x!fir.char<1,?>>> {
   %1 = fir.shape %arg2 : (index) -> !fir.shape<1>
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-  // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]]
+  // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 %[[arg2]], 7, 0, 1
@@ -108,7 +108,7 @@ func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) ->
   %c_7 = arith.constant 7 : index
   %1 = fir.shape %c_7 : (index) -> !fir.shape<1>
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-  // CHECK:   %[[size:.*]] = mul i64 1, %[[arg1]]
+  // CHECK:   %[[size:.*]] = mul i64 %[[arg1]], 1
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 7, 7, 0, 1
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 97d9b38ed6f40..d4c36a4f5b213 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -82,12 +82,8 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 // CHECK:         store [1 x i8] c" ", ptr %[[VAL_18]], align 1
 // CHECK:         call void @llvm.init.trampoline(ptr %[[VAL_20]], ptr @_QFtest_proc_dummy_charPgen_message, ptr %[[VAL_2]])
 // CHECK:         %[[VAL_23:.*]] = call ptr @llvm.adjust.trampoline(ptr %[[VAL_20]])
-// CHECK:         %[[VAL_25:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_23]], 0
-// CHECK:         %[[VAL_26:.*]] = insertvalue { ptr, i64 } %[[VAL_25]], i64 10, 1
 // CHECK:         %[[VAL_27:.*]] = call ptr @llvm.stacksave.p0()
-// CHECK:         %[[VAL_28:.*]] = extractvalue { ptr, i64 } %[[VAL_26]], 0
-// CHECK:         %[[VAL_29:.*]] = extractvalue { ptr, i64 } %[[VAL_26]], 1
-// CHECK:         %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_28]], i64 %[[VAL_29]])
+// CHECK:         %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_23]], i64 10)
 // CHECK:         %[[VAL_32:.*]] = call i1 @_FortranAioOutputAscii(ptr %{{.*}}, ptr %[[VAL_0]], i64 40)
 // CHECK:         call void @llvm.stackrestore.p0(ptr %[[VAL_27]])
 
@@ -115,14 +111,10 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 // CHECK-LABEL: define { ptr, i64 } @_QPget_message(ptr
 // CHECK-SAME:                  %[[VAL_0:.*]], i64 %[[VAL_1:.*]], ptr %[[VAL_2:.*]], i64
 // CHECK-SAME:                                                 %[[VAL_3:.*]])
-// CHECK:         %[[VAL_4:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_2]], 0
-// CHECK:         %[[VAL_5:.*]] = insertvalue { ptr, i64 } %[[VAL_4]], i64 %[[VAL_3]], 1
-// CHECK:         %[[VAL_7:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 0
-// CHECK:         %[[VAL_8:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 1
 // CHECK:         %[[VAL_9:.*]] = call ptr @llvm.stacksave.p0()
-// CHECK:         %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_8]], align 1
-// CHECK:         %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_7]](ptr %[[VAL_10]], i64 %[[VAL_8]])
-// CHECK:         %[[VAL_13:.*]] = add i64 %[[VAL_8]], 12
+// CHECK:         %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_3]], align 1
+// CHECK:         %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_2]](ptr %[[VAL_10]], i64 %[[VAL_3]])
+// CHECK:         %[[VAL_13:.*]] = add i64 %[[VAL_3]], 12
 // CHECK:         %[[VAL_14:.*]] = alloca i8, i64 %[[VAL_13]], align 1
 // CHECK:         call void @llvm.memmove.p0.p0.i64(ptr %[[VAL_14]], ptr {{.*}}, i64 12, i1 false)
 // CHECK:         %[[VAL_18:.*]] = phi i64
diff --git a/flang/test/Fir/embox.fir b/flang/test/Fir/embox.fir
index 0f304cff2c79e..11f7457b6873c 100644
--- a/flang/test/Fir/embox.fir
+++ b/flang/test/Fir/embox.fir
@@ -11,7 +11,7 @@ func.func @_QPtest_callee(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @_QPtest_slice() {
 // CHECK:  %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 // CHECK:  %[[a2:.*]] = alloca [20 x i32], i64 1, align 4
-// CHECK:  %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i64 0, i64 0
+// CHECK:  %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i32 0, i64 0
 // CHECK:  %[[a4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
 // CHECK:  { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK: [i64 1, i64 5, i64 8]] }, ptr %[[a3]], 0
@@ -38,7 +38,7 @@ func.func @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @_QPtest_dt_slice() {
 // CHECK:  %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 // CHECK:  %[[a3:.*]] = alloca [20 x %_QFtest_dt_sliceTt], i64 1, align 8
-// CHECK:  %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i64 0, i64 0, i32 0
+// CHECK:  %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i32 0, i64 0, i32 0
 // CHECK: %[[a5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
 // CHECK-SAME: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK-SAME: [i64 1, i64 5, i64 16
@@ -73,7 +73,7 @@ func.func @emboxSubstring(%arg0: !fir.ref<!fir.array<2x3x!fir.char<1,4>>>) {
   %0 = fir.shape %c2, %c3 : (index, index) -> !fir.shape<2>
   %1 = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 substr %c1_i64, %c2_i64 : (index, index, index, index, index, index, i64, i64) -> !fir.slice<2>
   %2 = fir.embox %arg0(%0) [%1] : (!fir.ref<!fir.array<2x3x!fir.char<1,4>>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
-  // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i64 0, i64 0, i64 0, i64 1
+  // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i32 0, i64 0, i64 0, i32 1
   // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 2, i32 20240719, i8 2, i8 40, i8 0, i8 0
   // CHECK-SAME: [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 2, i64 4], [3 x i64] [i64 1, i64 3, i64 8]] }
   // CHECK-SAME: ptr %[[addr]], 0
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
index 1645e1a407ad4..e517b1352ff5c 100644
--- a/flang/test/Fir/omp-reduction-embox-codegen.fir
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -23,14 +23,14 @@ omp.declare_reduction @test_reduction : !fir.ref<!fir.box<i32>> init {
   omp.yield(%0 : !fir.ref<!fir.box<i32>>)
 }
 
-func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+func.func @_QQmain()  -> !fir.ref<!fir.box<i32>> attributes {fir.bindc_name = "reduce"} {
   %4 = fir.alloca !fir.box<i32>
   omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
     omp.terminator
   }
-  return
+  return %4: !fir.ref<!fir.box<i32>>
 }
 
 // basically we are testing that there isn't a crash
-// CHECK-LABEL: define void @_QQmain
+// CHECK-LABEL: define ptr @_QQmain
 // CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
diff --git a/flang/test/Fir/optional.fir b/flang/test/Fir/optional.fir
index bded8b5332a30..66ff69f083467 100644
--- a/flang/test/Fir/optional.fir
+++ b/flang/test/Fir/optional.fir
@@ -37,8 +37,7 @@ func.func @bar2() -> i1 {
 
 // CHECK-LABEL: @foo3
 func.func @foo3(%arg0: !fir.boxchar<1>) -> i1 {
-  // CHECK: %[[extract:.*]] = extractvalue { ptr, i64 } %{{.*}}, 0
-  // CHECK: %[[ptr:.*]] = ptrtoint ptr %[[extract]] to i64
+  // CHECK: %[[ptr:.*]] = ptrtoint ptr %0 to i64
   // CHECK: icmp ne i64 %[[ptr]], 0
   %0 = fir.is_present %arg0 : (!fir.boxchar<1>) -> i1
   return %0 : i1
diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir
index a200cd7e7cc03..411927aae6bdf 100644
--- a/flang/test/Fir/pdt.fir
+++ b/flang/test/Fir/pdt.fir
@@ -96,13 +96,13 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 {
 
 func.func private @bar(!fir.ref<!fir.char<1,?>>)
 
-// CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1)
-func.func @_QPfoo(%arg0 : i32, %arg1 : i32) {
+// CHECK-LABEL: define ptr @_QPfoo(i32 %0, i32 %1)
+func.func @_QPfoo(%arg0 : i32, %arg1 : i32) -> !fir.ref<!fir.type<_QTt1>> {
   // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1)
   // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]]
   %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32)
   //%2 = fir.coordinate_of %0, f2 : (!fir.ref<!fir.type<_QTt1>>) -> !fir.ref<!fir.char<1,?>>
   %2 = fir.zero_bits !fir.ref<!fir.char<1,?>>
   fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> ()
-  return
+  return %0 : !fir.ref<!fir.type<_QTt1>>
 }
diff --git a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir
index 0c9f6d9bb94ad..d858adfb7c45d 100644
--- a/flang/test/Fir/rebox.fir
+++ b/flang/test/Fir/rebox.fir
@@ -36,7 +36,7 @@ func.func @test_rebox_1(%arg0: !fir.box<!fir.array<?x?xf32>>) {
   // CHECK: %[[VOIDBASE0:.*]] = getelementptr i8, ptr %[[INBASE]], i64 %[[OFFSET_0]]
   // CHECK: %[[OFFSET_1:.*]] = mul i64 2, %[[INSTRIDE_1]]
   // CHECK: %[[VOIDBASE1:.*]] = getelementptr i8, ptr %[[VOIDBASE0]], i64 %[[OFFSET_1]]
-  // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 3, %[[INSTRIDE_1]]
+  // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 %[[INSTRIDE_1]], 3
   // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[OUTSTRIDE0]], 7, 0, 2
   // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX1]], ptr %[[VOIDBASE1]], 0
   // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX2]], ptr %[[OUTBOX_ALLOC]], align 8
@@ -63,7 +63,7 @@ func.func @test_rebox_2(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
   // CHECK: %[[OUTBOX:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }
   // CHECK: %[[LEN_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 1
   // CHECK: %[[LEN:.*]] = load i64, ptr %[[LEN_GEP]]
-  // CHECK: %[[SIZE:.*]] = mul i64 1, %[[LEN]]
+  // CHECK: %[[SIZE:.*]] = mul i64 %[[LEN]], 1
   // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } undef, i64 %[[SIZE]], 1
 
   %1 = fir.rebox %arg0 [%0]  : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
@@ -94,8 +94,8 @@ func.func @test_rebox_3(%arg0: !fir.box<!fir.array<?xf32>>) {
   // CHECK: %[[INSTRIDE:.*]] = load i64, ptr %[[INSTRIDE_GEP]]
   // CHECK: %[[INBASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[INBASE:.*]] = load ptr, ptr %[[INBASE_GEP]]
-  // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 3, %[[INSTRIDE]]
-  // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 4, %[[OUTSTRIDE1]]
+  // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 %[[INSTRIDE]], 3
+  // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 %[[OUTSTRIDE1]], 4
   // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %{{.*}}, i64 %[[INSTRIDE]], 7, 0, 2
   // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX0]], i64 3, 7, 1, 0
   // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX1]], i64 4, 7, 1, 1
@@ -153,13 +153,13 @@ func.func @test_cmplx_1(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   %0:3 = fir.box_dims %arg0, %c0 : (!fir.box<!fir.array<?xcomplex<f32>>>, index) -> (index, index, index)
   %1 = fir.slice %c1, %0#1, %c1 path %c1_i32 : (index, index, index, i32) -> !fir.slice<1>
   %2 = fir.rebox %arg0 [%1] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.slice<1>) -> !fir.box<!fir.array<?xf32>>
-  // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i64 0, i32 1
+  // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 1
   // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]]
   // CHECK: %[[INSTRIDE_1_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 2
   // CHECK: %[[INSTRIDE_1:.*]] = load i64, ptr %[[INSTRIDE_1_GEP]]
   // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]]
-  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 0
+  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 0
   // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 0, %[[INSTRIDE_1]]
   // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]]
   // CHECK: %[[SUB_1:.*]] = sub i64 %[[INSTRIDE_0]], 1
@@ -167,7 +167,7 @@ func.func @test_cmplx_1(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   // CHECK: %[[DIV_1:.*]] = sdiv i64 %[[ADD_1]], 1
   // CHECK: %[[CHECK_NONZERO:.*]] = icmp sgt i64 %[[DIV_1]], 0
   // CHECK: %[[CHECKED_BOUND:.*]] = select i1 %[[CHECK_NONZERO]], i64 %[[DIV_1]], i64 0
-  // CHECK: %[[STRIDE:.*]] = mul i64 1, %[[INSTRIDE_1]]
+  // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_1]], 1
   // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[CHECKED_BOUND]], 7, 0, 1
   // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], i64 %[[STRIDE]], 7, 0, 2
   // CHECK: %[[VAL_BUILD_3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OFFSET_GEP]], 0
@@ -198,10 +198,10 @@ func.func @test_cmplx_2(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]]
   // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]]
-  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 1
+  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 1
   // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 6, %[[INSTRIDE_0]]
   // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]]
-  // CHECK: %[[STRIDE:.*]] = mul i64 5, %[[INSTRIDE_0]]
+  // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_0]], 5
   // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[STRIDE]], 7, 0, 2
   // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], ptr %[[OFFSET_GEP]], 0
   // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OUTBOX_ALLOC]]
diff --git a/flang/test/Fir/select.fir b/flang/test/Fir/select.fir
index 5e88048446407..6d843e824d33f 100644
--- a/flang/test/Fir/select.fir
+++ b/flang/test/Fir/select.fir
@@ -64,6 +64,6 @@ func.func @h(%a : i32) -> i32 {
    return %1 : i32
 ^bb6:
    %x = arith.addi %b4, %b3 : i32
-   // CHECK: ret i32
+   // CHECK-DAG: ret i32
    return %x : i32
 }
diff --git a/flang/test/Fir/target.fir b/flang/test/Fir/target.fir
index b04e23a018e7e..1e721a09c835e 100644
--- a/flang/test/Fir/target.fir
+++ b/flang/test/Fir/target.fir
@@ -97,10 +97,6 @@ func.func @call8() {
 // X64-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3)
 // PPC-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3)
 func.func @char1lensum(%arg0 : !fir.boxchar<1>, %arg1 : !fir.boxchar<1>) -> i64 {
-  // X64-DAG: %[[p0:.*]] = insertvalue { ptr, i64 } undef, ptr %1, 0
-  // X64-DAG: = insertvalue { ptr, i64 } %[[p0]], i64 %3, 1
-  // X64-DAG: %[[p1:.*]] = insertvalue { ptr, i64 } undef, ptr %0, 0
-  // X64-DAG: = insertvalue { ptr, i64 } %[[p1]], i64 %2, 1
   %1:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1>>, i64)
   %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1>>, i64)
   // I32: %[[add:.*]] = add i64 %
diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir
index 4907aa03ec5a5..072c8bbe4e80c 100644
--- a/flang/test/Fir/tbaa-codegen2.fir
+++ b/flang/test/Fir/tbaa-codegen2.fir
@@ -62,9 +62,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
 // CHECK-LABEL: define void @_QPfunc(
 // CHECK-SAME:      ptr {{[^%]*}}%[[ARG0:.*]]){{.*}}{
 // [...]
-// CHECK:  %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 0
+// CHECK:  %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 6
 // box access:
-// CHECK:  %[[VAL6:.*]] = load i64, ptr %[[VAL5]], align 4, !tbaa ![[BOX_ACCESS_TAG:.*]]
+// CHECK:  %[[VAL6:.*]] = load i8, ptr %[[VAL5]], align 1, !tbaa ![[BOX_ACCESS_TAG:.*]]
 // CHECK:  %[[VAL7:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i32 0, i32 7, i32 0, i32 1
 // box access:
 // CHECK:  %[[VAL8:.*]] = load i64, ptr %[[VAL7]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
@@ -76,15 +76,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
 // CHECK:  %[[VAL12:.*]] = load ptr, ptr %[[VAL11]], align 8, !tbaa ![[BOX_ACCESS_TAG]]
 // CHECK:  %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %[[VAL12]], 0
 // CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL15]], ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
-// CHECK:  %[[VAL16:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 0
-// box access:
-// CHECK:  %[[VAL17:.*]] = load i64, ptr %[[VAL16]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
-// CHECK:  %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 1
+// CHECK:  %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i32 0, i32 1
 // box access:
 // CHECK:  %[[VAL19:.*]] = load i64, ptr %[[VAL18]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
-// CHECK:  %[[VAL20:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 2
-// box access:
-// CHECK:  %[[VAL21:.*]] = load i64, ptr %[[VAL20]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
 // [...]
 // box access:
 // CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
index 665be5a8db4d4..5ce36ac87ca8c 100644
--- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90
+++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
@@ -545,7 +545,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCATABLE_DESC_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_type_allocaTone_layer, i64 1, align 8
 !CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_type_allocaTone_layer, ptr %[[ALLOCA]], i32 0, i32 4
-!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[DESC_BOUND_ACCESS_LOAD:.*]] = load i64, ptr %[[DESC_BOUND_ACCESS]], align 8
 !CHECK: %[[OFFSET_UB:.*]] = sub i64 %[[DESC_BOUND_ACCESS_LOAD]], 1
 !CHECK: %[[MEMBER_DESCRIPTOR_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[MEMBER_ACCESS]], i32 0, i32 0
@@ -596,7 +596,7 @@ end subroutine mapType_common_block_members
 !CHECK: %{{.*}} = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 0
 !CHECK: %{{.*}} = load ptr, ptr %{{.*}}, align 8
 !CHECK: %{{.*}} = getelementptr %_QFmaptype_alloca_derived_typeTone_layer, ptr %{{.*}}, i32 0, i32 4
-!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[LOAD_DESC_MEMBER_UB:.*]] = load i64, ptr %[[ACCESS_DESC_MEMBER_UB]], align 8
 !CHECK: %[[OFFSET_MEMBER_UB:.*]] = sub i64 %[[LOAD_DESC_MEMBER_UB]], 1
 !CHECK: %[[DTYPE_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0
@@ -665,7 +665,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 !CHECK: %[[DTYPE_DESC_ALLOCA_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, align 8
 !CHECK: %[[DTYPE_DESC_ALLOCA_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1, align 8
-!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_ALLOCA_UB]], align 8
 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD]], 1
 !CHECK: %[[DTYPE_DESC_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0
@@ -734,7 +734,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_nested_derived_type_allocaTtop_layer, i64 1, align 8
 !CHECK: %[[NESTED_DTYPE_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTtop_layer, ptr %[[ALLOCA]], i32 0, i32 6
 !CHECK: %[[NESTED_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTmiddle_layer, ptr %[[NESTED_DTYPE_MEMBER_ACCESS]], i32 0, i32 2
-!CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[ALLOCATABLE_MEMBER_ADDR_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_BASE_ADDR]], align 8
 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ADDR_LOAD]], 1
 !CHECK: %[[NESTED_MEMBER_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 0
@@ -778,9 +778,9 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, align 8
 !CHECK: %[[BASE_PTR_1:.*]] = alloca %_QFmaptype_nested_derived_type_member_idxTdtype, i64 1, align 8
 !CHECK: %[[OFF_PTR_1:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTdtype, ptr %[[BASE_PTR_1]], i32 0, i32 1
-!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[BOUNDS_LD:.*]] = load i64, ptr %[[BOUNDS_ACC]], align 8
-!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i64 0, i32 1
+!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i32 0, i32 1
 !CHECK: %[[BOUNDS_LD_2:.*]] = load i64, ptr %[[BOUNDS_ACC_2]], align 8
 !CHECK: %[[BOUNDS_CALC:.*]] = sub i64 %[[BOUNDS_LD_2]], 1
 !CHECK: %[[OFF_PTR_CALC_0:.*]] = sub i64 %[[BOUNDS_LD]], 1
@@ -789,7 +789,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[LOAD_DESC_PTR:.*]] = load ptr, ptr %[[GEP_DESC_PTR]], align 8
 !CHECK: %[[SZ_CALC_1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_0]], i32 0, i32 7, i32 0, i32 2
 !CHECK: %[[SZ_CALC_2:.*]] = load i64, ptr %[[SZ_CALC_1]], align 8
-!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 1, %[[SZ_CALC_2]]
+!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 %[[SZ_CALC_2]], 1
 !CHECK: %[[SZ_CALC_4:.*]] = add nsw i64 %[[SZ_CALC_3]], 0
 !CHECK: %[[SZ_CALC_5:.*]] = getelementptr i8, ptr %[[LOAD_DESC_PTR]], i64 %[[SZ_CALC_4]]
 !CHECK: %[[SZ_CALC_6:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTvertexes, ptr %[[SZ_CALC_5]], i32 0, i32 2
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index e6a8c5e025123..5a28e97054359 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -606,8 +606,6 @@ program test_alloc
 ! LLVM-COUNT-2:  call void %{{[0-9]*}}()
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
-! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7
-! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]]
 ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1
 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]]
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4
@@ -620,8 +618,6 @@ program test_alloc
 ! LLVM: call void %{{.*}}(ptr %{{.*}}) 
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
-! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7
-! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]]
 ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1
 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]]
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4
diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90
index d1e12a8dbdfec..7a1f4b125a79f 100644
--- a/flang/test/Lower/forall/character-1.f90
+++ b/flang/test/Lower/forall/character-1.f90
@@ -23,11 +23,11 @@ end program test
 
 ! CHECK-LABEL: define internal void @_QFPsub(
 ! CHECK-SAME:    ptr {{[^%]*}}%[[arg:.*]])
-! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1
+! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i32 0, i32 1
 ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]]
 ! CHECK: %[[elesize:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 1
 ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]]
-! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]]
+! CHECK: %[[mul:.*]] = mul i64 %[[esval]], 1
 ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]]
 ! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 ! CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
new file mode 100644
index 0000000000000..af6dfb0057688
--- /dev/null
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
@@ -0,0 +1,23 @@
+//===- OpenMPOffloadPrivatizationPrepare.h - Prepare for OpenMP Offload
+// Privatization -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
+#define MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+namespace LLVM {
+#define GEN_PASS_DECL_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS
+#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
+} // namespace LLVM
+} // namespace mlir
+
+#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
index 961909d5c8d27..1ba67caba05be 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
@@ -73,4 +73,16 @@ def DIScopeForLLVMFuncOpPass : Pass<"ensure-debug-info-scope-on-llvm-func", "::m
   ];
 }
 
+def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "::mlir::LLVM::LLVMFuncOp"> {
+    let summary = "Prepare OpenMP maps for privatization for deferred target tasks";
+    let description = [{
+      When generating LLVM IR for privatized variables in an OpenMP offloading directive (e.g. omp::TargetOp)
+      that creates a deferred target task (when the nowait clause is used), we need to copy the privatized
+      variable out of the stack of the generating task and into the heap so that the deferred target task
+      can still access it. However, if such a privatized variable is also mapped, typically the case for
+      allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated
+      variable and not the original variable.
+    }];
+  let dependentDialects = ["LLVM::LLVMDialect", "mlir::omp::OpenMPDialect"];
+}
 #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 2548a8ab4aac6..efa43107da068 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -1479,8 +1479,8 @@ def TargetOp : OpenMP_Op<"target", traits = [
     `map` operands. For `private` operands that require a map, the value of the
     corresponding element in the attribute is the index of the `map` operand
     (relative to other `map` operands not the whole operands of the operation). For
-    `private` opernads that do not require a map, this value is -1 (which is omitted
-    from the assembly foramt printing).
+    `private` operands that do not require a map, this value is -1 (which is omitted
+    from the assembly format printing).
   }] # clausesDescription;
 
   let arguments = !con(clausesArgs,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index feaffa34897b6..10c398e67d88e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -68,6 +68,7 @@ mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
                              SymbolTableCollection *symbolTables) {
   assert(moduleOp->hasTrait<OpTrait::SymbolTable>() &&
          "expected SymbolTable operation");
+  llvm::errs() << "Looking up " << name << "\n";
   auto func = lookupFuncOp(name, moduleOp, symbolTables);
   auto funcT = LLVMFunctionType::get(resultType, paramTypes, isVarArg);
   // Assert the signature of the found function is same as expected
@@ -85,7 +86,7 @@ mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
     }
     return func;
   }
-
+  llvm::errs() << "Did not find " << name << ".. creating it \n";
   OpBuilder::InsertionGuard g(b);
   assert(!moduleOp->getRegion(0).empty() && "expected non-empty region");
   b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
index d4ff0955c5d0e..729f5191cd557 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
   LegalizeForExport.cpp
   OptimizeForNVVM.cpp
   RequestCWrappers.cpp
+  OpenMPOffloadPrivatizationPrepare.cpp
 
   DEPENDS
   MLIRLLVMPassIncGen
@@ -18,4 +19,5 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
   MLIRPass
   MLIRTransforms
   MLIRNVVMDialect
+  MLIROpenMPDialect
   )
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
new file mode 100644
index 0000000000000..e745e43072113
--- /dev/null
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -0,0 +1,429 @@
+//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare for OpenMP Offload
+// Privatization ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include <cstdint>
+#include <utility>
+
+//===----------------------------------------------------------------------===//
+// A pass that prepares OpenMP code for translation of delayed privatization
+// in the context of deferred target tasks. Deferred target tasks are created
+// when the nowait clause is used on the target directive.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "omp-prepare-for-offload-privatization"
+#define PDBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "]: ")
+
+namespace mlir {
+namespace LLVM {
+
+#define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS
+#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
+
+} // namespace LLVM
+} // namespace mlir
+
+using namespace mlir;
+namespace {
+
+//===----------------------------------------------------------------------===//
+// OMPTargetPrepareDelayedPrivatizationPattern
+//===----------------------------------------------------------------------===//
+
+class OMPTargetPrepareDelayedPrivatizationPattern
+    : public OpRewritePattern<omp::TargetOp> {
+public:
+  using OpRewritePattern<omp::TargetOp>::OpRewritePattern;
+
+  // Rewrites an omp::TargetOp that has private variables and the nowait clause
+  // (i.e. a deferred target task). Each private variable whose privatizer
+  // needs a map, and whose map is not captured by copy, gets a heap
+  // allocation holding a copy of the original variable; the omp::MapInfoOp
+  // instances mapping that variable are cloned to refer to the heap copy, and
+  // the target op is replaced by a clone that privatizes the heap copies.
+  mlir::LogicalResult
+  matchAndRewrite(omp::TargetOp targetOp,
+                  PatternRewriter &rewriter) const override {
+    if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp))
+      return rewriter.notifyMatchFailure(
+          targetOp,
+          "targetOp does not have privateVars or does not need a target task");
+
+    ModuleOp mod = targetOp->getParentOfType<ModuleOp>();
+    LLVM::LLVMFuncOp llvmFunc = targetOp->getParentOfType<LLVM::LLVMFuncOp>();
+    OperandRange privateVars = targetOp.getPrivateVars();
+    mlir::SmallVector<mlir::Value> newPrivVars;
+
+    newPrivVars.reserve(privateVars.size());
+    std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
+    for (auto [privVarIdx, privVarSymPair] :
+         llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) {
+      auto privVar = std::get<0>(privVarSymPair);
+      auto privSym = std::get<1>(privVarSymPair);
+
+      omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym);
+      if (!privatizer.needsMap()) {
+        newPrivVars.push_back(privVar);
+        continue;
+      }
+      bool isFirstPrivate = privatizer.getDataSharingType() ==
+                            omp::DataSharingClauseType::FirstPrivate;
+
+      mlir::Value mappedValue =
+          targetOp.getMappedValueForPrivateVar(privVarIdx);
+      Operation *mapInfoOperation = mappedValue.getDefiningOp();
+      auto mapInfoOp = mlir::cast<omp::MapInfoOp>(mapInfoOperation);
+
+      if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) {
+        newPrivVars.push_back(privVar);
+        continue;
+      }
+      // Treat the privVar as varPtr. TODO: For boxchars this likely won't be a
+      // pointer. Allocate heap memory that corresponds to the type of memory
+      // pointed to by varPtr
+      mlir::Value varPtr = privVar;
+      mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
+      if (!heapMem) {
+        newPrivVars.push_back(privVar);
+        return failure();
+      }
+      newPrivVars.push_back(heapMem);
+
+      // Find the earliest insertion point for the copy.
+
+      // Now, fix up the omp::MapInfoOp instances that use varPtr to refer
+      // to heapMem instead.
+      using ReplacementEntry = std::pair<Operation *, Operation *>;
+      llvm::SmallVector<ReplacementEntry> replRecord;
+
+
+      Operation *varPtrDefiningOp = varPtr.getDefiningOp();
+      std::set<Operation *> users;
+      users.insert(varPtrDefiningOp->user_begin(),
+                   varPtrDefiningOp->user_end());
+
+      auto usesVarPtr = [&users](Operation *op) -> bool {
+        return users.count(op);
+      };
+
+      SmallVector<Operation *> chainOfOps;
+      chainOfOps.push_back(mapInfoOperation);
+      if (!mapInfoOp.getMembers().empty()) {
+
+        for (auto member : mapInfoOp.getMembers()) {
+          if (usesVarPtr(member.getDefiningOp()))
+            chainOfOps.push_back(member.getDefiningOp());
+
+          omp::MapInfoOp memberMap =
+              mlir::cast<omp::MapInfoOp>(member.getDefiningOp());
+          if (memberMap.getVarPtrPtr() &&
+              usesVarPtr(memberMap.getVarPtrPtr().getDefiningOp()))
+            chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp());
+        }
+      }
+      DominanceInfo dom;
+      llvm::sort(chainOfOps, [&](Operation *l, Operation *r) {
+        return dom.properlyDominates(l, r); // irreflexive, as sort requires
+      });
+
+      rewriter.setInsertionPoint(chainOfOps.front());
+      mlir::Location loc = chainOfOps.front()->getLoc();
+      mlir::Type varType = getElemType(varPtr);
+      // Copy the value of the local variable into the heap-allocated location.
+      auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
+      LLVM_ATTRIBUTE_UNUSED auto storeInst =
+          rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
+
+      auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
+        Operation *clonedOp = rewriter.clone(*origOp);
+        rewriter.replaceAllOpUsesWith(origOp, clonedOp);
+        replRecord.push_back(std::make_pair(origOp, clonedOp));
+        return clonedOp;
+      };
+
+      rewriter.setInsertionPoint(targetOp);
+      rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation));
+
+      // Fix any members that may use varPtr to now use heapMem
+      if (!mapInfoOp.getMembers().empty()) {
+        for (auto member : mapInfoOp.getMembers()) {
+          Operation *memberOperation = member.getDefiningOp();
+          if (!usesVarPtr(memberOperation))
+            continue;
+          rewriter.setInsertionPoint(cloneAndMarkForDeletion(memberOperation));
+
+          auto memberMapInfoOp = mlir::cast<omp::MapInfoOp>(memberOperation);
+          if (memberMapInfoOp.getVarPtrPtr()) {
+            Operation *varPtrPtrdefOp =
+                memberMapInfoOp.getVarPtrPtr().getDefiningOp();
+
+            // In the case of firstprivate, we have to do the following
+            // 1. Allocate heap memory for the underlying data.
+            // 2. Copy the original underlying data to the new memory allocated
+            // on the heap.
+            // 3. Put this new (heap) address in the originating
+            // struct/descriptor
+
+            // Consider the following sequence of omp.map.info and omp.target
+            // operations.
+            // %0 = llvm.getelementptr %19[0, 0]
+            // %1 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) ...
+            //                   var_ptr_ptr(%0 : !llvm.ptr)  bounds(..)
+            // %2 = omp.map.info var_ptr(%19 : !llvm.ptr, !desc_type) ...
+            //                   members(%1 : [0] : !llvm.ptr) -> !llvm.ptr
+            // omp.target nowait map_entries(%2 -> %arg5, %1 -> %arg8 : ..)
+            //                   private(@privatizer %19 -> %arg9 [map_idx=1] :
+            //                   !llvm.ptr) {
+            // We need to allocate memory on the heap for the underlying pointer
+            // which is stored at the var_ptr_ptr operand of %1. Then we need to
+            // copy this pointer to the new heap allocated memory location.
+            // Then, we need to store the address of the new heap location in
+            // the originating struct/descriptor. So, we generate the following
+            // (pseudo) MLIR code (Using the same names of mlir::Value instances
+            // in the example as in the code below)
+            //
+            // %dataMalloc = malloc(totalSize)
+            // %loadDataPtr = load %0 : !llvm.ptr -> !llvm.ptr
+            // memcpy(%dataMalloc, %loadDataPtr, totalSize)
+            // %newVarPtrPtrOp = llvm.getelementptr %heapMem[0, 0]
+            // llvm.store %dataMalloc, %newVarPtrPtrOp
+            // %1.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, i32) ...
+            //                          var_ptr_ptr(%newVarPtrPtrOp : !llvm.ptr)
+            // %2.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr,
+            //                                             !desc_type) ...
+            //                          members(%1.cloned : [0] : !llvm.ptr)
+            //             -> !llvm.ptr
+            // omp.target nowait map_entries(%2.cloned -> %arg5,
+            //                               %1.cloned -> %arg8 : ..)
+            //            private(@privatizer %heapMem -> .. [map_idx=1] : ..) {
+
+            if (isFirstPrivate) {
+              assert(!memberMapInfoOp.getBounds().empty() &&
+                     "empty bounds on member map of firstprivate variable");
+              mlir::Location loc = memberMapInfoOp.getLoc();
+              mlir::Value totalSize =
+                  getSizeInBytes(memberMapInfoOp, mod, rewriter);
+              auto dataMalloc = allocateHeapMem(loc, totalSize, mod, rewriter);
+              auto loadDataPtr = rewriter.create<LLVM::LoadOp>(
+                  loc, memberMapInfoOp.getVarPtrPtr().getType(),
+                  memberMapInfoOp.getVarPtrPtr());
+              LLVM_ATTRIBUTE_UNUSED auto memcpy =
+                  rewriter.create<mlir::LLVM::MemcpyOp>(
+                      loc, dataMalloc.getResult(), loadDataPtr.getResult(),
+                      totalSize, /*isVolatile=*/false);
+              Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp);
+              rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(),
+                                            newVarPtrPtrOp->getOpResult(0),
+                                            loadDataPtr);
+              rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() {
+                newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem);
+              });
+              LLVM_ATTRIBUTE_UNUSED auto storePtr =
+                  rewriter.create<LLVM::StoreOp>(loc, dataMalloc.getResult(),
+                                                 newVarPtrPtrOp->getResult(0));
+            } else
+              rewriter.setInsertionPoint(
+                  cloneAndMarkForDeletion(varPtrPtrdefOp));
+          }
+        }
+      }
+
+      for (auto repl : replRecord) {
+        Operation *origOp = repl.first;
+        Operation *clonedOp = repl.second;
+        rewriter.modifyOpInPlace(clonedOp, [&]() {
+          clonedOp->replaceUsesOfWith(varPtr, heapMem);
+        });
+        rewriter.eraseOp(origOp);
+      }
+    }
+    assert(newPrivVars.size() == privateVars.size() &&
+           "The number of private variables must match before and after "
+           "transformation");
+
+    rewriter.setInsertionPoint(targetOp);
+    Operation *newOp = rewriter.clone(*targetOp.getOperation());
+    omp::TargetOp newTargetOp = mlir::cast<omp::TargetOp>(newOp);
+    rewriter.modifyOpInPlace(newTargetOp, [&]() {
+      newTargetOp.getPrivateVarsMutable().assign(newPrivVars);
+    });
+    rewriter.replaceOp(targetOp, newTargetOp);
+    return mlir::success();
+  }
+
+private:
+  bool hasPrivateVars(omp::TargetOp targetOp) const {
+    return targetOp.getPrivateVars().size() != 0; // any private operands?
+  }
+
+  bool isTargetTaskDeferred(omp::TargetOp targetOp) const {
+    return targetOp.getNowait(); // `nowait` is what makes the task deferred
+  }
+
+  // Resolves privSym (expected to be a SymbolRefAttr) to the privatizer it
+  // names, searching from the symbol table nearest to op.
+  template <typename OpTy>
+  omp::PrivateClauseOp findPrivatizer(OpTy op, mlir::Attribute privSym) const {
+    auto symRef = llvm::cast<SymbolRefAttr>(privSym);
+    return SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(op,
+                                                                      symRef);
+  }
+
+  template <typename OpType>
+  mlir::Type getElemType(OpType op) const {
+    return op.getElemType(); // forward to the op's own getElemType() accessor
+  }
+
+  mlir::Type getElemType(mlir::Value varPtr) const {
+    // Look through address-space casts to the op that produced the pointer.
+    Operation *producer = unwrapAddrSpaceCast(varPtr.getDefiningOp());
+    assert((mlir::isa<LLVM::AllocaOp, LLVM::GEPOp>(producer)) &&
+           "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only "
+           "with Alloca or GEP for now");
+    if (auto allocaOp = mlir::dyn_cast<LLVM::AllocaOp>(producer))
+      return allocaOp.getElemType();
+    // TODO: get rid of this because GEPOp.getElemType() is not the right thing
+    // to use.
+    auto gepOp = mlir::dyn_cast<LLVM::GEPOp>(producer);
+    return gepOp ? gepOp.getElemType() : mlir::Type{};
+  }
+
+  // Peels llvm.addrspacecast ops off op and returns the first op that is not
+  // one. A null op (the value was a block argument) is returned unchanged.
+  mlir::Operation *unwrapAddrSpaceCast(Operation *op) const {
+    while (auto castOp = llvm::dyn_cast_if_present<LLVM::AddrSpaceCastOp>(op))
+      op = castOp.getArg().getDefiningOp();
+    return op;
+  }
+
+  // Get the (compile-time constant) size of varType as per the given
+  // DataLayout dl, rounded up to a multiple of the type's ABI alignment.
+  std::int64_t getSizeInBytes(const mlir::DataLayout &dl,
+                              mlir::Type varType) const {
+    llvm::TypeSize size = dl.getTypeSize(varType);
+    std::uint64_t alignment = dl.getTypeABIAlignment(varType); // not narrowed
+    return llvm::alignTo(size, alignment);
+  }
+
+  // Emits IR computing the number of bytes mapped by mapInfoOp: the product,
+  // over all of its bounds, of (upper - lower + 1), times the element size.
+  mlir::Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
+                             PatternRewriter &rewriter) const {
+    mlir::Location loc = mapInfoOp.getLoc();
+    mlir::Type i64Ty = rewriter.getI64Type();
+    mlir::Value one = rewriter.create<LLVM::ConstantOp>(loc, i64Ty, 1);
+
+    // Running product of the per-dimension element counts.
+    mlir::Value numElems = one;
+    // TODO: Consider using boundsOp.getExtent() if available.
+    for (mlir::Value bound : mapInfoOp.getBounds()) {
+      auto boundsOp = mlir::cast<omp::MapBoundsOp>(bound.getDefiningOp());
+      // The +1 treats the lower and upper bound as both inclusive.
+      mlir::Value span = rewriter.create<LLVM::SubOp>(
+          loc, i64Ty, boundsOp.getUpperBound(), boundsOp.getLowerBound());
+      mlir::Value extent =
+          rewriter.create<LLVM::AddOp>(loc, i64Ty, span, one);
+      numElems = rewriter.create<LLVM::MulOp>(loc, i64Ty, numElems, extent);
+    }
+
+    // The element size is a compile-time constant taken from the data layout.
+    mlir::DataLayout layout(mod);
+    std::int64_t bytesPerElem = getSizeInBytes(layout, mapInfoOp.getVarType());
+    mlir::Value bytesPerElemV =
+        rewriter.create<LLVM::ConstantOp>(loc, i64Ty, bytesPerElem);
+    return rewriter.create<LLVM::MulOp>(loc, i64Ty, numElems, bytesPerElemV);
+  }
+
+  LLVM::LLVMFuncOp getMalloc(ModuleOp mod, PatternRewriter &rewriter) const {
+    llvm::FailureOr<mlir::LLVM::LLVMFuncOp> mallocCall =
+        LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
+    assert(llvm::succeeded(mallocCall) && "Could not find malloc in the module");
+    return mallocCall.value();
+  }
+
+  template <typename OpTy>
+  mlir::Value allocateHeapMem(OpTy targetOp, mlir::Value privVar, ModuleOp mod,
+                              PatternRewriter &rewriter) const {
+    mlir::Value varPtr = privVar;
+    Operation *definingOp = varPtr.getDefiningOp();
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(definingOp);
+    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
+
+    mlir::Location loc = definingOp->getLoc();
+    mlir::Type varType = getElemType(varPtr);
+    assert(mod.getDataLayoutSpec() &&
+           "MLIR module with no datalayout spec not handled yet");
+    const mlir::DataLayout &dl = mlir::DataLayout(mod);
+    std::int64_t distance = getSizeInBytes(dl, varType);
+    mlir::Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
+        loc, mallocFn.getFunctionType().getParamType(0), distance);
+
+    auto mallocCallOp = rewriter.create<LLVM::CallOp>(loc, mallocFn,
+                                                      ValueRange{sizeBytes});
+    return mallocCallOp.getResult();
+  }
+
+  LLVM::CallOp allocateHeapMem(mlir::Location loc, mlir::Value size,
+                               ModuleOp mod, PatternRewriter &rewriter) const {
+    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
+    return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// PrepareForOMPOffloadPrivatizationPass
+//===----------------------------------------------------------------------===//
+
+struct PrepareForOMPOffloadPrivatizationPass
+    : public LLVM::impl::PrepareForOMPOffloadPrivatizationPassBase<
+          PrepareForOMPOffloadPrivatizationPass> {
+
+  void runOnOperation() override {
+    LLVM::LLVMFuncOp func = getOperation();
+    MLIRContext &context = getContext();
+    ModuleOp mod = func->getParentOfType<ModuleOp>();
+
+    // FunctionFilteringPass removes bounds arguments from omp.map.info
+    // operations. We require bounds else our pass asserts. But, that's only for
+    // maps in functions that are on the host. So, skip functions being compiled
+    // for the target.
+    auto offloadModuleInterface =
+        mlir::dyn_cast<omp::OffloadModuleInterface>(mod.getOperation());
+    if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) {
+      return;
+    }
+
+    RewritePatternSet patterns(&context);
+    patterns.add<OMPTargetPrepareDelayedPrivatizationPattern>(&context);
+    LLVM_DEBUG(llvm::dbgs() << " Module before : " << mod << "\n");
+    if (mlir::failed(
+            applyPatternsGreedily(func, std::move(patterns),
+                                  GreedyRewriteConfig().setStrictness(
+                                      GreedyRewriteStrictness::ExistingOps)))) {
+      emitError(func.getLoc(),
+                "error in preparing targetOps for delayed privatization.");
+      signalPassFailure();
+    }
+    LLVM_DEBUG(llvm::dbgs() << " Module after : " << mod << "\n");
+
+  }
+};
+} // namespace
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6694de8383534..f3cbd62b53342 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -356,14 +356,8 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       result = todo("priority");
   };
   auto checkPrivate = [&todo](auto op, LogicalResult &result) {
-    if constexpr (std::is_same_v<std::decay_t<decltype(op)>, omp::TargetOp>) {
-      // Privatization is supported only for included target tasks.
-      if (!op.getPrivateVars().empty() && op.getNowait())
-        result = todo("privatization for deferred target tasks");
-    } else {
-      if (!op.getPrivateVars().empty() || op.getPrivateSyms())
-        result = todo("privatization");
-    }
+    if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+      result = todo("privatization");
   };
   auto checkReduction = [&todo](auto op, LogicalResult &result) {
     if (isa<omp::TeamsOp>(op))
@@ -450,7 +444,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkDevice(op, result);
         checkInReduction(op, result);
         checkIsDevicePtr(op, result);
-        checkPrivate(op, result);
       })
       .Default([](Operation &) {
         // Assume all clauses for an operation can be translated unless they are
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index de714d8b740af..99f384fbb1f7e 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -624,6 +624,7 @@ LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
   // We use the thread-pool this context is creating, and avoid
   // creating any thread when disabled.
   MLIRContext threadPoolCtx;
+  llvm::errs() << "threadPoolCtx.isMultithreadingEnabled() = " << threadPoolCtx.isMultithreadingEnabled() << "\n";
   if (threadPoolCtx.isMultithreadingEnabled())
     threadPool = &threadPoolCtx.getThreadPool();
 
diff --git a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
new file mode 100644
index 0000000000000..6b8121b262f47
--- /dev/null
+++ b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
@@ -0,0 +1,167 @@
+// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
+  llvm.func @free(!llvm.ptr)
+  omp.private {type = private} @privatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i32) : i32
+    "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+
+  omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> copy {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i32) : i32
+    "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+
+  llvm.func internal @private_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+    %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+    %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+    llvm.store %0, %21 : i32, !llvm.ptr
+    %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %151 = llvm.load %150 : !llvm.ptr -> i64
+    %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %153 = llvm.load %152 : !llvm.ptr -> i64
+    %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %155 = llvm.load %154 : !llvm.ptr -> i64
+    %156 = llvm.sub %153, %1 : i64
+    %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true}
+    %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""}
+    %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr
+    omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %19 -> %arg9 [map_idx=1] : !llvm.ptr) {
+      omp.terminator
+    }
+    %166 = llvm.mlir.constant(48 : i32) : i32
+    %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr
+    llvm.call @free(%168) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+
+  llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+    %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+    %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+    llvm.store %0, %21 : i32, !llvm.ptr
+    %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %151 = llvm.load %150 : !llvm.ptr -> i64
+    %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %153 = llvm.load %152 : !llvm.ptr -> i64
+    %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %155 = llvm.load %154 : !llvm.ptr -> i64
+    %156 = llvm.sub %153, %1 : i64
+    %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true}
+    %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""}
+    %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr
+    omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %19 -> %arg9 [map_idx=1] : !llvm.ptr) {
+      omp.terminator
+    }
+    %166 = llvm.mlir.constant(48 : i32) : i32
+    %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr
+    llvm.call @free(%168) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+// CHECK-LABEL:   llvm.func @malloc(i64) -> !llvm.ptr
+// CHECK:         llvm.func @free(!llvm.ptr)
+
+// CHECK-LABEL:   llvm.func internal @private_test(
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
+// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_1]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_1]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+// CHECK: llvm.store %[[VAL_0]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_6]] : i32, !llvm.ptr
+// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+// CHECK: %[[VAL_8:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_9:.*]] = llvm.load %[[VAL_8]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_14:.*]] = llvm.sub %[[VAL_11]], %[[VAL_2]] : i64
+// CHECK: %[[VAL_15:.*]] = omp.map.bounds lower_bound(%[[VAL_2]] : i64) upper_bound(%[[VAL_14]] : i64) extent(%[[VAL_11]] : i64) stride(%[[VAL_13]] : i64) start_idx(%[[VAL_9]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_16:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_16]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_17]] : !llvm.ptr) bounds(%[[VAL_15]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_18]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target nowait map_entries(%[[VAL_7]] -> %[[VAL_20:.*]], %[[VAL_19]] -> %[[VAL_21:.*]], %[[VAL_18]] -> %[[VAL_22:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %[[HEAP]] -> %[[VAL_23:.*]] [map_idx=1] : !llvm.ptr) {
+// CHECK:   omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_25]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK:         }
+
+// CHECK-LABEL:   llvm.func internal @firstprivate_test(
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(4 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_2:.*]] = llvm.mlir.undef :
+// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_5]]) : (i64) -> !llvm.ptr
+// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_3]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_3]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+// CHECK: llvm.store %[[VAL_2]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_3]], %[[VAL_8]] : i32, !llvm.ptr
+// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_8]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc)
+// CHECK-SAME: capture(ByCopy) -> !llvm.ptr {name = "i"}
+// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr,
+// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) ->
+// CHECK-SAME: !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_16:.*]] = llvm.sub %[[VAL_13]], %[[VAL_4]] : i64
+// CHECK: %[[VAL_17:.*]] = omp.map.bounds lower_bound(%[[VAL_4]] : i64) upper_bound(%[[VAL_16]] : i64) extent(%[[VAL_13]] : i64) stride(%[[VAL_15]] : i64) start_idx(%[[VAL_11]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_18:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_18]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_20:.*]] = llvm.sub %[[VAL_16]], %[[VAL_4]] : i64
+// CHECK: %[[VAL_21:.*]] = llvm.add %[[VAL_20]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_22:.*]] = llvm.mul %[[VAL_1]], %[[VAL_21]] : i64
+// CHECK: %[[VAL_23:.*]] = llvm.mul %[[VAL_22]], %[[VAL_0]] : i64
+// CHECK: %[[NEW_DATA_PTR:.*]] = llvm.call @malloc(%[[VAL_23]]) : (i64) -> !llvm.ptr
+// CHECK: %[[OLD_DATA_PTR:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> !llvm.ptr
+// CHECK: "llvm.intr.memcpy"(%[[NEW_DATA_PTR]], %[[OLD_DATA_PTR]], %[[VAL_23]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[NEW_DATA_PTR]], %[[VAL_26]] : !llvm.ptr, !llvm.ptr
+// CHECK: %[[VAL_27:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef)
+// CHECK-SAME: var_ptr_ptr(%[[VAL_26]] : !llvm.ptr) bounds(%[[VAL_17]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_28:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>)
+// CHECK-SAME: map_clauses(always, to) capture(ByRef) members(%[[VAL_27]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target nowait map_entries(%[[VAL_9]] -> %[[VAL_29:.*]], %[[VAL_28]] -> %[[VAL_30:.*]], %[[VAL_27]] -> %[[VAL_31:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr)
+// CHECK-SAME: private(@firstprivatizer %[[HEAP]] -> %[[VAL_32:.*]] [map_idx=1] : !llvm.ptr) {
+// CHECK:   omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_33:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_34:.*]] = llvm.load %[[VAL_33]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_34]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK:         }
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 2fa4470bb8300..af6d254cfd3c3 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -249,24 +249,6 @@ llvm.func @target_is_device_ptr(%x : !llvm.ptr) {
 
 // -----
 
-omp.private {type = firstprivate} @x.privatizer : i32 copy {
-^bb0(%mold: !llvm.ptr, %private: !llvm.ptr):
-  %0 = llvm.load %mold : !llvm.ptr -> i32
-  llvm.store %0, %private : i32, !llvm.ptr
-  omp.yield(%private: !llvm.ptr)
-}
-llvm.func @target_firstprivate(%x : !llvm.ptr) {
-  %0 = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr
-  // expected-error at below {{not yet implemented: Unhandled clause privatization for deferred target tasks in omp.target operation}}
-  // expected-error at below {{LLVM Translation failed for operation: omp.target}}
-  omp.target nowait map_entries(%0 -> %blockarg0 : !llvm.ptr) private(@x.privatizer %x -> %arg0 [map_idx=0] : !llvm.ptr) {
-    omp.terminator
-  }
-  llvm.return
-}
-
-// -----
-
 llvm.func @target_enter_data_depend(%x: !llvm.ptr) {
   // expected-error at below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}}
   // expected-error at below {{LLVM Translation failed for operation: omp.target_enter_data}}

>From f72152a0b4d7fe53f973cb7c3ee30053e0e6845b Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 25 Aug 2025 22:20:18 -0500
Subject: [PATCH 02/27] Add some comments and clean up some code

---
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |  1 -
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  3 +-
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 28 ++++++++-----------
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp       |  2 +-
 4 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index b4b4e6e7e1283..76f3cbd421cb9 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -52,7 +52,6 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/LLVMIR/Transforms/AddComdats.h"
-#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/BuiltinTypes.h"
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 10c398e67d88e..feaffa34897b6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -68,7 +68,6 @@ mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
                              SymbolTableCollection *symbolTables) {
   assert(moduleOp->hasTrait<OpTrait::SymbolTable>() &&
          "expected SymbolTable operation");
-  llvm::errs() << "Looking up " << name << "\n";
   auto func = lookupFuncOp(name, moduleOp, symbolTables);
   auto funcT = LLVMFunctionType::get(resultType, paramTypes, isVarArg);
   // Assert the signature of the found function is same as expected
@@ -86,7 +85,7 @@ mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
     }
     return func;
   }
-  llvm::errs() << "Did not find " << name << ".. creating it \n";
+
   OpBuilder::InsertionGuard g(b);
   assert(!moduleOp->getRegion(0).empty() && "expected non-empty region");
   b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index e745e43072113..3efdd5e141584 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -95,25 +95,21 @@ class OMPTargetPrepareDelayedPrivatizationPattern
         newPrivVars.push_back(privVar);
         continue;
       }
-      // Treat the privVar as varPtr. TODO: For boxchars this likely wont be a
-      // pointer. Allocate heap memory that corresponds to the type of memory
+
+      // Allocate heap memory that corresponds to the type of memory
       // pointed to by varPtr
+      // TODO: For boxchars this likely wont be a pointer.
       mlir::Value varPtr = privVar;
       mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
-      if (!heapMem) {
-        newPrivVars.push_back(privVar);
+      if (!heapMem)
         return failure();
-      }
-      newPrivVars.push_back(heapMem);
-
-      // Find the earliest insertion point for the copy.
-
-      // Now, fix up the omp::MapInfoOp instances that use varPtr to refer
-      // to heapMem instead.
-      using ReplacementEntry = std::pair<Operation *, Operation *>;
-      llvm::SmallVector<ReplacementEntry> replRecord;
 
+      newPrivVars.push_back(heapMem);
 
+      // Find the earliest insertion point for the copy. This will be before
+      // the first in the list of omp::MapInfoOp instances that use varPtr.
+      // After the copy these omp::MapInfoOp instances will refer to heapMem
+      // instead.
       Operation *varPtrDefiningOp = varPtr.getDefiningOp();
       std::set<Operation *> users;
       users.insert(varPtrDefiningOp->user_begin(),
@@ -122,11 +118,9 @@ class OMPTargetPrepareDelayedPrivatizationPattern
       auto usesVarPtr = [&users](Operation *op) -> bool {
         return users.count(op);
       };
-
       SmallVector<Operation *> chainOfOps;
       chainOfOps.push_back(mapInfoOperation);
       if (!mapInfoOp.getMembers().empty()) {
-
         for (auto member : mapInfoOp.getMembers()) {
           if (usesVarPtr(member.getDefiningOp()))
             chainOfOps.push_back(member.getDefiningOp());
@@ -144,13 +138,15 @@ class OMPTargetPrepareDelayedPrivatizationPattern
       });
 
       rewriter.setInsertionPoint(chainOfOps.front());
+      // Copy the value of the local variable into the heap-allocated location.
       mlir::Location loc = chainOfOps.front()->getLoc();
       mlir::Type varType = getElemType(varPtr);
-      // Copy the value of the local variable into the heap-allocated location.
       auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
       LLVM_ATTRIBUTE_UNUSED auto storeInst =
           rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
 
+      using ReplacementEntry = std::pair<Operation *, Operation *>;
+      llvm::SmallVector<ReplacementEntry> replRecord;
       auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
         Operation *clonedOp = rewriter.clone(*origOp);
         rewriter.replaceAllOpUsesWith(origOp, clonedOp);
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 99f384fbb1f7e..60c5406bdd197 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -624,7 +624,7 @@ LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
   // We use the thread-pool this context is creating, and avoid
   // creating any thread when disabled.
   MLIRContext threadPoolCtx;
-  llvm::errs() << "threadPoolCtx.isMultithreadingEnabled() = " << threadPoolCtx.isMultithreadingEnabled() << "\n";
+
   if (threadPoolCtx.isMultithreadingEnabled())
     threadPool = &threadPoolCtx.getThreadPool();
 

>From c859bbc8fc63d53514502570af71c6dfeae68d9f Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 26 Aug 2025 17:12:35 -0500
Subject: [PATCH 03/27] Fix CHECK stmts in test to account for constant folding
 done by the greedy pattern matcher

---
 flang/test/Fir/omp_target_allocmem_freemem.fir | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/flang/test/Fir/omp_target_allocmem_freemem.fir b/flang/test/Fir/omp_target_allocmem_freemem.fir
index 03eb94acb1ac7..aa7b2dce07153 100644
--- a/flang/test/Fir/omp_target_allocmem_freemem.fir
+++ b/flang/test/Fir/omp_target_allocmem_freemem.fir
@@ -62,7 +62,7 @@ func.func @omp_target_allocmem_scalar_char_kind() -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar(
 // CHECK-SAME: i32 [[TMP0:%.*]]) {
 // CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 1
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
 // CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
@@ -80,7 +80,7 @@ func.func @omp_target_allocmem_scalar_dynchar(%l : i32) -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar_kind(
 // CHECK-SAME: i32 [[TMP0:%.*]]) {
 // CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 2, [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
 // CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
@@ -141,7 +141,7 @@ func.func @omp_target_allocmem_array_of_dynchar(%l: i32) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 12, [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 12
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0)
 // CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
@@ -157,7 +157,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar(%e: index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar2(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 4, [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 4
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
@@ -174,7 +174,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar2(%e: index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 60, [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 60
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0)
 // CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
@@ -191,7 +191,7 @@ func.func @omp_target_allocmem_dynarray_of_char(%e : index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char2(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 20, [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 20
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
@@ -227,7 +227,7 @@ func.func @omp_target_allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_dynchar2(
 // CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) {
 // CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = mul i64 2, [[TMP3]]
+// CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 // CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP1]]
 // CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP1]]
 // CHECK-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]

>From 697cc4ff3309a9084cfa9686c18d66fb1f8fbd13 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 26 Aug 2025 23:09:10 -0500
Subject: [PATCH 04/27] Fix clang-format issues

---
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  7 ++++---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 20 +++++++++----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index d9cb14262ea95..6c9e0648fede8 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -414,9 +414,10 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
 
   // Run a pass to prepare for translation of delayed privatization in the
   // context of deferred target tasks.
-  addNestedPassConditionally<mlir::LLVM::LLVMFuncOp>(pm, disableFirToLlvmIr,[&]() {
-    return mlir::LLVM::createPrepareForOMPOffloadPrivatizationPass();
-  });
+  addNestedPassConditionally<mlir::LLVM::LLVMFuncOp>(
+      pm, disableFirToLlvmIr, [&]() {
+        return mlir::LLVM::createPrepareForOMPOffloadPrivatizationPass();
+      });
 }
 
 } // namespace fir
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 3efdd5e141584..a2e522d5f536d 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -244,9 +244,8 @@ class OMPTargetPrepareDelayedPrivatizationPattern
       for (auto repl : replRecord) {
         Operation *origOp = repl.first;
         Operation *clonedOp = repl.second;
-        rewriter.modifyOpInPlace(clonedOp, [&]() {
-          clonedOp->replaceUsesOfWith(varPtr, heapMem);
-        });
+        rewriter.modifyOpInPlace(
+            clonedOp, [&]() { clonedOp->replaceUsesOfWith(varPtr, heapMem); });
         rewriter.eraseOp(origOp);
       }
     }
@@ -312,7 +311,7 @@ class OMPTargetPrepareDelayedPrivatizationPattern
   // Get the (compile-time constant) size of varType as per the
   // given DataLayout dl.
   std::int64_t getSizeInBytes(const mlir::DataLayout &dl,
-                           mlir::Type varType) const {
+                              mlir::Type varType) const {
     llvm::TypeSize size = dl.getTypeSize(varType);
     unsigned short alignment = dl.getTypeABIAlignment(varType);
     return llvm::alignTo(size, alignment);
@@ -350,7 +349,8 @@ class OMPTargetPrepareDelayedPrivatizationPattern
   LLVM::LLVMFuncOp getMalloc(ModuleOp mod, PatternRewriter &rewriter) const {
     llvm::FailureOr<mlir::LLVM::LLVMFuncOp> mallocCall =
         LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
-    assert(llvm::succeeded(mallocCall) && "Could not find malloc in the module");
+    assert(llvm::succeeded(mallocCall) &&
+           "Could not find malloc in the module");
     return mallocCall.value();
   }
 
@@ -372,8 +372,8 @@ class OMPTargetPrepareDelayedPrivatizationPattern
     mlir::Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
         loc, mallocFn.getFunctionType().getParamType(0), distance);
 
-    auto mallocCallOp = rewriter.create<LLVM::CallOp>(loc, mallocFn,
-                                                      ValueRange{sizeBytes});
+    auto mallocCallOp =
+        rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{sizeBytes});
     return mallocCallOp.getResult();
   }
 
@@ -409,7 +409,7 @@ struct PrepareForOMPOffloadPrivatizationPass
 
     RewritePatternSet patterns(&context);
     patterns.add<OMPTargetPrepareDelayedPrivatizationPattern>(&context);
-    LLVM_DEBUG(llvm::dbgs() << " Module before : " << mod << "\n");
+
     if (mlir::failed(
             applyPatternsGreedily(func, std::move(patterns),
                                   GreedyRewriteConfig().setStrictness(
@@ -418,8 +418,6 @@ struct PrepareForOMPOffloadPrivatizationPass
                 "error in preparing targetOps for delayed privatization.");
       signalPassFailure();
     }
-    LLVM_DEBUG(llvm::dbgs() << " Module after : " << mod << "\n");
-
   }
 };
 } // namespace

>From bc107cd8c199166c1aa12fc517b5298469706949 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 4 Sep 2025 23:17:56 -0500
Subject: [PATCH 05/27] Checkpoint commit, working with operation->walk

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 335 +++++++++++++++++-
 1 file changed, 333 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index a2e522d5f536d..a7a415348bad4 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -388,13 +388,14 @@ class OMPTargetPrepareDelayedPrivatizationPattern
 // PrepareForOMPOffloadPrivatizationPass
 //===----------------------------------------------------------------------===//
 
-struct PrepareForOMPOffloadPrivatizationPass
+class PrepareForOMPOffloadPrivatizationPass
     : public LLVM::impl::PrepareForOMPOffloadPrivatizationPassBase<
           PrepareForOMPOffloadPrivatizationPass> {
 
   void runOnOperation() override {
     LLVM::LLVMFuncOp func = getOperation();
-    MLIRContext &context = getContext();
+    LLVM_DEBUG(llvm::dbgs() << "In PrepareForOMPOffloadPrivatizationPass\n");
+    LLVM_DEBUG(llvm::dbgs() << "Func is \n" << func << "\n");
     ModuleOp mod = func->getParentOfType<ModuleOp>();
 
     // FunctionFilteringPass removes bounds arguments from omp.map.info
@@ -406,6 +407,8 @@ struct PrepareForOMPOffloadPrivatizationPass
     if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) {
       return;
     }
+#if 0
+    MLIRContext &context = getContext();
 
     RewritePatternSet patterns(&context);
     patterns.add<OMPTargetPrepareDelayedPrivatizationPattern>(&context);
@@ -418,6 +421,334 @@ struct PrepareForOMPOffloadPrivatizationPass
                 "error in preparing targetOps for delayed privatization.");
       signalPassFailure();
     }
+#else
+    getOperation()->walk([&](omp::TargetOp targetOp) {
+      if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp))
+        return;
+      IRRewriter rewriter(&getContext());
+      ModuleOp mod = targetOp->getParentOfType<ModuleOp>();
+      LLVM::LLVMFuncOp llvmFunc = targetOp->getParentOfType<LLVM::LLVMFuncOp>();
+      OperandRange privateVars = targetOp.getPrivateVars();
+      mlir::SmallVector<mlir::Value> newPrivVars;
+
+      newPrivVars.reserve(privateVars.size());
+      std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
+      for (auto [privVarIdx, privVarSymPair] :
+           llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) {
+        auto privVar = std::get<0>(privVarSymPair);
+        auto privSym = std::get<1>(privVarSymPair);
+
+        omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym);
+        if (!privatizer.needsMap()) {
+          newPrivVars.push_back(privVar);
+          continue;
+        }
+        bool isFirstPrivate = privatizer.getDataSharingType() ==
+                              omp::DataSharingClauseType::FirstPrivate;
+
+        mlir::Value mappedValue =
+            targetOp.getMappedValueForPrivateVar(privVarIdx);
+        Operation *mapInfoOperation = mappedValue.getDefiningOp();
+        auto mapInfoOp = mlir::cast<omp::MapInfoOp>(mapInfoOperation);
+
+        if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) {
+          newPrivVars.push_back(privVar);
+          continue;
+        }
+
+        // Allocate heap memory that corresponds to the type of memory
+        // pointed to by varPtr
+        // TODO: For boxchars this likely won't be a pointer.
+        mlir::Value varPtr = privVar;
+        mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
+        if (!heapMem)
+          targetOp.emitError("Unable to allocate heap memory when try to move "
+                             "a private variable out of the stack and into the "
+                             "heap for use by a deferred target task");
+
+        newPrivVars.push_back(heapMem);
+        // Find the earliest insertion point for the copy. This will be before
+        // the first in the list of omp::MapInfoOp instances that use varPtr.
+        // After the copy these omp::MapInfoOp instances will refer to heapMem
+        // instead.
+        Operation *varPtrDefiningOp = varPtr.getDefiningOp();
+        std::set<Operation *> users;
+        users.insert(varPtrDefiningOp->user_begin(),
+                     varPtrDefiningOp->user_end());
+
+        auto usesVarPtr = [&users](Operation *op) -> bool {
+          return users.count(op);
+        };
+        SmallVector<Operation *> chainOfOps;
+        chainOfOps.push_back(mapInfoOperation);
+        if (!mapInfoOp.getMembers().empty()) {
+          for (auto member : mapInfoOp.getMembers()) {
+            if (usesVarPtr(member.getDefiningOp()))
+              chainOfOps.push_back(member.getDefiningOp());
+
+            omp::MapInfoOp memberMap =
+                mlir::cast<omp::MapInfoOp>(member.getDefiningOp());
+            if (memberMap.getVarPtrPtr() &&
+                usesVarPtr(memberMap.getVarPtrPtr().getDefiningOp()))
+              chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp());
+          }
+        }
+        DominanceInfo dom;
+        llvm::sort(chainOfOps, [&](Operation *l, Operation *r) {
+          return dom.dominates(l, r);
+        });
+
+        rewriter.setInsertionPoint(chainOfOps.front());
+        // Copy the value of the local variable into the heap-allocated
+        // location.
+        mlir::Location loc = chainOfOps.front()->getLoc();
+        mlir::Type varType = getElemType(varPtr);
+        auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
+        LLVM_ATTRIBUTE_UNUSED auto storeInst =
+            rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
+
+        using ReplacementEntry = std::pair<Operation *, Operation *>;
+        llvm::SmallVector<ReplacementEntry> replRecord;
+        auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
+          Operation *clonedOp = rewriter.clone(*origOp);
+          rewriter.replaceAllOpUsesWith(origOp, clonedOp);
+          replRecord.push_back(std::make_pair(origOp, clonedOp));
+          return clonedOp;
+        };
+
+        rewriter.setInsertionPoint(targetOp);
+        rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation));
+
+        // Fix any members that may use varPtr to now use heapMem
+        if (!mapInfoOp.getMembers().empty()) {
+          for (auto member : mapInfoOp.getMembers()) {
+            Operation *memberOperation = member.getDefiningOp();
+            if (!usesVarPtr(memberOperation))
+              continue;
+            rewriter.setInsertionPoint(
+                cloneAndMarkForDeletion(memberOperation));
+
+            auto memberMapInfoOp = mlir::cast<omp::MapInfoOp>(memberOperation);
+            if (memberMapInfoOp.getVarPtrPtr()) {
+              Operation *varPtrPtrdefOp =
+                  memberMapInfoOp.getVarPtrPtr().getDefiningOp();
+
+              // In the case of firstprivate, we have to do the following
+              // 1. Allocate heap memory for the underlying data.
+              // 2. Copy the original underlying data to the new memory
+              // allocated on the heap.
+              // 3. Put this new (heap) address in the originating
+              // struct/descriptor
+
+              // Consider the following sequence of omp.map.info and omp.target
+              // operations.
+              // %0 = llvm.getelementptr %19[0, 0]
+              // %1 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) ...
+              //                   var_ptr_ptr(%0 : !llvm.ptr)  bounds(..)
+              // %2 = omp.map.info var_ptr(%19 : !llvm.ptr, !desc_type)>) ...
+              //                   members(%1 : [0] : !llvm.ptr) -> !llvm.ptr
+              // omp.target nowait map_entries(%2 -> %arg5, %1 -> %arg8 : ..)
+              //                   private(@privatizer %19 -> %arg9 [map_idx=1]
+              //                   : !llvm.ptr) {
+              // We need to allocate memory on the heap for the underlying
+              // pointer which is stored at the var_ptr_ptr operand of %1. Then
+              // we need to copy this pointer to the new heap allocated memory
+              // location. Then, we need to store the address of the new heap
+              // location in the originating struct/descriptor. So, we generate
+              // the following (pseudo) MLIR code (Using the same names of
+              // mlir::Value instances in the example as in the code below)
+              //
+              // %dataMalloc = malloc(totalSize)
+              // %loadDataPtr = load %0 : !llvm.ptr -> !llvm.ptr
+              // memcpy(%dataMalloc, %loadDataPtr, totalSize)
+              // %newVarPtrPtrOp = llvm.getelementptr %heapMem[0, 0]
+              // llvm.store %dataMalloc, %newVarPtrPtrOp
+              // %1.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, i32) ...
+              //                          var_ptr_ptr(%newVarPtrPtrOp :
+              //                          !llvm.ptr)
+              // %2.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr,
+              //                                             !desc_type)>) ...
+              //                          members(%1.cloned : [0] : !llvm.ptr)
+              //             -> !llvm.ptr
+              // omp.target nowait map_entries(%2.cloned -> %arg5,
+              //                               %1.cloned -> %arg8 : ..)
+              //            private(@privatizer %heapMem -> .. [map_idx=1] : ..)
+              //            {
+
+              if (isFirstPrivate) {
+                assert(!memberMapInfoOp.getBounds().empty() &&
+                       "empty bounds on member map of firstprivate variable");
+                mlir::Location loc = memberMapInfoOp.getLoc();
+                mlir::Value totalSize =
+                    getSizeInBytes(memberMapInfoOp, mod, rewriter);
+                auto dataMalloc =
+                    allocateHeapMem(loc, totalSize, mod, rewriter);
+                auto loadDataPtr = rewriter.create<LLVM::LoadOp>(
+                    loc, memberMapInfoOp.getVarPtrPtr().getType(),
+                    memberMapInfoOp.getVarPtrPtr());
+                LLVM_ATTRIBUTE_UNUSED auto memcpy =
+                    rewriter.create<mlir::LLVM::MemcpyOp>(
+                        loc, dataMalloc.getResult(), loadDataPtr.getResult(),
+                        totalSize, /*isVolatile=*/false);
+                Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp);
+                rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(),
+                                              newVarPtrPtrOp->getOpResult(0),
+                                              loadDataPtr);
+                rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() {
+                  newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem);
+                });
+                LLVM_ATTRIBUTE_UNUSED auto storePtr =
+                    rewriter.create<LLVM::StoreOp>(
+                        loc, dataMalloc.getResult(),
+                        newVarPtrPtrOp->getResult(0));
+              } else
+                rewriter.setInsertionPoint(
+                    cloneAndMarkForDeletion(varPtrPtrdefOp));
+            }
+          }
+        }
+
+        for (auto repl : replRecord) {
+          Operation *origOp = repl.first;
+          Operation *clonedOp = repl.second;
+          rewriter.modifyOpInPlace(clonedOp, [&]() {
+            clonedOp->replaceUsesOfWith(varPtr, heapMem);
+          });
+          rewriter.eraseOp(origOp);
+        }
+      }
+      assert(newPrivVars.size() == privateVars.size() &&
+             "The number of private variables must match before and after "
+             "transformation");
+
+      rewriter.setInsertionPoint(targetOp);
+      Operation *newOp = rewriter.clone(*targetOp.getOperation());
+      omp::TargetOp newTargetOp = mlir::cast<omp::TargetOp>(newOp);
+      rewriter.modifyOpInPlace(newTargetOp, [&]() {
+        newTargetOp.getPrivateVarsMutable().assign(newPrivVars);
+      });
+      rewriter.replaceOp(targetOp, newTargetOp);
+    });
+#endif
+  }
+private:
+  bool hasPrivateVars(omp::TargetOp targetOp) const {
+    return !targetOp.getPrivateVars().empty();
+  }
+
+  bool isTargetTaskDeferred(omp::TargetOp targetOp) const {
+    return targetOp.getNowait();
+  }
+
+  template <typename OpTy>
+  omp::PrivateClauseOp findPrivatizer(OpTy op, mlir::Attribute privSym) const {
+    SymbolRefAttr privatizerName = llvm::cast<SymbolRefAttr>(privSym);
+    omp::PrivateClauseOp privatizer =
+        SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+            op, privatizerName);
+    return privatizer;
+  }
+
+  template <typename OpType>
+  mlir::Type getElemType(OpType op) const {
+    return op.getElemType();
+  }
+
+  mlir::Type getElemType(mlir::Value varPtr) const {
+    Operation *definingOp = unwrapAddrSpaceCast(varPtr.getDefiningOp());
+    assert((mlir::isa<LLVM::AllocaOp, LLVM::GEPOp>(definingOp)) &&
+           "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only "
+           "with Alloca or GEP for now");
+    if (auto allocaOp = mlir::dyn_cast<LLVM::AllocaOp>(definingOp))
+      return getElemType(allocaOp);
+    // TODO: get rid of this because GEPOp.getElemType() is not the right thing
+    // to use.
+    if (auto gepOp = mlir::dyn_cast<LLVM::GEPOp>(definingOp))
+      return getElemType(gepOp);
+    return mlir::Type{};
+  }
+
+  mlir::Operation *unwrapAddrSpaceCast(Operation *op) const {
+    if (!mlir::isa<LLVM::AddrSpaceCastOp>(op))
+      return op;
+    mlir::LLVM::AddrSpaceCastOp addrSpaceCastOp =
+        mlir::cast<LLVM::AddrSpaceCastOp>(op);
+    return unwrapAddrSpaceCast(addrSpaceCastOp.getArg().getDefiningOp());
+  }
+
+  // Get the (compile-time constant) size of varType as per the
+  // given DataLayout dl.
+  std::int64_t getSizeInBytes(const mlir::DataLayout &dl,
+                              mlir::Type varType) const {
+    llvm::TypeSize size = dl.getTypeSize(varType);
+    unsigned short alignment = dl.getTypeABIAlignment(varType);
+    return llvm::alignTo(size, alignment);
+  }
+
+  // Generate code to get the size of data being mapped from the bounds
+  // of mapInfoOp
+  mlir::Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
+                             IRRewriter &rewriter) const {
+    mlir::Location loc = mapInfoOp.getLoc();
+    mlir::Type llvmInt64Ty = rewriter.getI64Type();
+    mlir::Value constOne =
+        rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, 1);
+    mlir::Value elementCount = constOne;
+    // TODO: Consider using  boundsOp.getExtent() if available.
+    for (auto bounds : mapInfoOp.getBounds()) {
+      auto boundsOp = mlir::cast<omp::MapBoundsOp>(bounds.getDefiningOp());
+      elementCount = rewriter.create<LLVM::MulOp>(
+          loc, llvmInt64Ty, elementCount,
+          rewriter.create<LLVM::AddOp>(
+              loc, llvmInt64Ty,
+              (rewriter.create<LLVM::SubOp>(loc, llvmInt64Ty,
+                                            boundsOp.getUpperBound(),
+                                            boundsOp.getLowerBound())),
+              constOne));
+    }
+    const mlir::DataLayout &dl = mlir::DataLayout(mod);
+    std::int64_t elemSize = getSizeInBytes(dl, mapInfoOp.getVarType());
+    mlir::Value elemSizeV =
+        rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, elemSize);
+    return rewriter.create<LLVM::MulOp>(loc, llvmInt64Ty, elementCount,
+                                        elemSizeV);
+  }
+
+  LLVM::LLVMFuncOp getMalloc(ModuleOp mod, IRRewriter &rewriter) const {
+    llvm::FailureOr<mlir::LLVM::LLVMFuncOp> mallocCall =
+        LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
+    assert(llvm::succeeded(mallocCall) &&
+           "Could not find malloc in the module");
+    return mallocCall.value();
+  }
+
+  template <typename OpTy>
+  mlir::Value allocateHeapMem(OpTy targetOp, mlir::Value privVar, ModuleOp mod,
+                              IRRewriter &rewriter) const {
+    mlir::Value varPtr = privVar;
+    Operation *definingOp = varPtr.getDefiningOp();
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(definingOp);
+    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
+
+    mlir::Location loc = definingOp->getLoc();
+    mlir::Type varType = getElemType(varPtr);
+    assert(mod.getDataLayoutSpec() &&
+           "MLIR module with no datalayout spec not handled yet");
+    const mlir::DataLayout &dl = mlir::DataLayout(mod);
+    std::int64_t distance = getSizeInBytes(dl, varType);
+    mlir::Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
+        loc, mallocFn.getFunctionType().getParamType(0), distance);
+
+    auto mallocCallOp =
+        rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{sizeBytes});
+    return mallocCallOp.getResult();
+  }
+
+  LLVM::CallOp allocateHeapMem(mlir::Location loc, mlir::Value size,
+                               ModuleOp mod, IRRewriter &rewriter) const {
+    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
+    return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});
   }
 };
 } // namespace

>From f7dacb33be0ca6fb17ef8cf4559f3d4cdea9ebad Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 8 Sep 2025 14:11:21 -0500
Subject: [PATCH 06/27] Address more comments from tblah and meinersbur

---
 .../OpenMPOffloadPrivatizationPrepare.h       |   3 +-
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |   4 +-
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 486 +++---------------
 3 files changed, 63 insertions(+), 430 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
index af6dfb0057688..86aad5c593025 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
@@ -1,5 +1,4 @@
-//===- OpenMPOffloadPrivatizationPrepare.h - Prepare for OpenMP Offload
-// Privatization -*- C++ -*-===//
+//===- OpenMPOffloadPrivatizationPrepare.h -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index efa43107da068..2548a8ab4aac6 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -1479,8 +1479,8 @@ def TargetOp : OpenMP_Op<"target", traits = [
     `map` operands. For `private` operands that require a map, the value of the
     corresponding element in the attribute is the index of the `map` operand
     (relative to other `map` operands not the whole operands of the operation). For
-    `private` operands that do not require a map, this value is -1 (which is omitted
-    from the assembly format printing).
+    `private` operands that do not require a map, this value is -1 (which is omitted
+    from the assembly format printing).
   }] # clausesDescription;
 
   let arguments = !con(clausesArgs,
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index a7a415348bad4..781ec34435e1d 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -1,5 +1,4 @@
-//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare for OpenMP Offload
-// Privatization ---------===//
+//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare OMP privatization --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+//#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
@@ -17,8 +17,6 @@
 #include "mlir/IR/Dominance.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include <cstdint>
 #include <utility>
 
@@ -29,7 +27,6 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "omp-prepare-for-offload-privatization"
-#define PDBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "]: ")
 
 namespace mlir {
 namespace LLVM {
@@ -43,347 +40,6 @@ namespace LLVM {
 using namespace mlir;
 namespace {
 
-//===----------------------------------------------------------------------===//
-// OMPTargetPrepareDelayedPrivatizationPattern
-//===----------------------------------------------------------------------===//
-
-class OMPTargetPrepareDelayedPrivatizationPattern
-    : public OpRewritePattern<omp::TargetOp> {
-public:
-  using OpRewritePattern<omp::TargetOp>::OpRewritePattern;
-
-  // Match omp::TargetOp that have the following characteristics.
-  // 1. have private vars which refer to local (stack) memory
-  // 2. the target op has the nowait clause
-  // In this case, we allocate memory for the privatized variable on the heap
-  // and copy the original variable into this new heap allocation. We fix up
-  // any omp::MapInfoOp instances that may be mapping the private variable.
-  mlir::LogicalResult
-  matchAndRewrite(omp::TargetOp targetOp,
-                  PatternRewriter &rewriter) const override {
-    if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp))
-      return rewriter.notifyMatchFailure(
-          targetOp,
-          "targetOp does not have privateVars or does not need a target task");
-
-    ModuleOp mod = targetOp->getParentOfType<ModuleOp>();
-    LLVM::LLVMFuncOp llvmFunc = targetOp->getParentOfType<LLVM::LLVMFuncOp>();
-    OperandRange privateVars = targetOp.getPrivateVars();
-    mlir::SmallVector<mlir::Value> newPrivVars;
-
-    newPrivVars.reserve(privateVars.size());
-    std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
-    for (auto [privVarIdx, privVarSymPair] :
-         llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) {
-      auto privVar = std::get<0>(privVarSymPair);
-      auto privSym = std::get<1>(privVarSymPair);
-
-      omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym);
-      if (!privatizer.needsMap()) {
-        newPrivVars.push_back(privVar);
-        continue;
-      }
-      bool isFirstPrivate = privatizer.getDataSharingType() ==
-                            omp::DataSharingClauseType::FirstPrivate;
-
-      mlir::Value mappedValue =
-          targetOp.getMappedValueForPrivateVar(privVarIdx);
-      Operation *mapInfoOperation = mappedValue.getDefiningOp();
-      auto mapInfoOp = mlir::cast<omp::MapInfoOp>(mapInfoOperation);
-
-      if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) {
-        newPrivVars.push_back(privVar);
-        continue;
-      }
-
-      // Allocate heap memory that corresponds to the type of memory
-      // pointed to by varPtr
-      // TODO: For boxchars this likely wont be a pointer.
-      mlir::Value varPtr = privVar;
-      mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
-      if (!heapMem)
-        return failure();
-
-      newPrivVars.push_back(heapMem);
-
-      // Find the earliest insertion point for the copy. This will be before
-      // the first in the list of omp::MapInfoOp instances that use varPtr.
-      // After the copy these omp::MapInfoOp instances will refer to heapMem
-      // instead.
-      Operation *varPtrDefiningOp = varPtr.getDefiningOp();
-      std::set<Operation *> users;
-      users.insert(varPtrDefiningOp->user_begin(),
-                   varPtrDefiningOp->user_end());
-
-      auto usesVarPtr = [&users](Operation *op) -> bool {
-        return users.count(op);
-      };
-      SmallVector<Operation *> chainOfOps;
-      chainOfOps.push_back(mapInfoOperation);
-      if (!mapInfoOp.getMembers().empty()) {
-        for (auto member : mapInfoOp.getMembers()) {
-          if (usesVarPtr(member.getDefiningOp()))
-            chainOfOps.push_back(member.getDefiningOp());
-
-          omp::MapInfoOp memberMap =
-              mlir::cast<omp::MapInfoOp>(member.getDefiningOp());
-          if (memberMap.getVarPtrPtr() &&
-              usesVarPtr(memberMap.getVarPtrPtr().getDefiningOp()))
-            chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp());
-        }
-      }
-      DominanceInfo dom;
-      llvm::sort(chainOfOps, [&](Operation *l, Operation *r) {
-        return dom.dominates(l, r);
-      });
-
-      rewriter.setInsertionPoint(chainOfOps.front());
-      // Copy the value of the local variable into the heap-allocated location.
-      mlir::Location loc = chainOfOps.front()->getLoc();
-      mlir::Type varType = getElemType(varPtr);
-      auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
-      LLVM_ATTRIBUTE_UNUSED auto storeInst =
-          rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
-
-      using ReplacementEntry = std::pair<Operation *, Operation *>;
-      llvm::SmallVector<ReplacementEntry> replRecord;
-      auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
-        Operation *clonedOp = rewriter.clone(*origOp);
-        rewriter.replaceAllOpUsesWith(origOp, clonedOp);
-        replRecord.push_back(std::make_pair(origOp, clonedOp));
-        return clonedOp;
-      };
-
-      rewriter.setInsertionPoint(targetOp);
-      rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation));
-
-      // Fix any members that may use varPtr to now use heapMem
-      if (!mapInfoOp.getMembers().empty()) {
-        for (auto member : mapInfoOp.getMembers()) {
-          Operation *memberOperation = member.getDefiningOp();
-          if (!usesVarPtr(memberOperation))
-            continue;
-          rewriter.setInsertionPoint(cloneAndMarkForDeletion(memberOperation));
-
-          auto memberMapInfoOp = mlir::cast<omp::MapInfoOp>(memberOperation);
-          if (memberMapInfoOp.getVarPtrPtr()) {
-            Operation *varPtrPtrdefOp =
-                memberMapInfoOp.getVarPtrPtr().getDefiningOp();
-
-            // In the case of firstprivate, we have to do the following
-            // 1. Allocate heap memory for the underlying data.
-            // 2. Copy the original underlying data to the new memory allocated
-            // on the heap.
-            // 3. Put this new (heap) address in the originating
-            // struct/descriptor
-
-            // Consider the following sequence of omp.map.info and omp.target
-            // operations.
-            // %0 = llvm.getelementptr %19[0, 0]
-            // %1 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) ...
-            //                   var_ptr_ptr(%0 : !llvm.ptr)  bounds(..)
-            // %2 = omp.map.info var_ptr(%19 : !llvm.ptr, !desc_type)>) ...
-            //                   members(%1 : [0] : !llvm.ptr) -> !llvm.ptr
-            // omp.target nowait map_entries(%2 -> %arg5, %1 -> %arg8 : ..)
-            //                   private(@privatizer %19 -> %arg9 [map_idx=1] :
-            //                   !llvm.ptr) {
-            // We need to allocate memory on the heap for the underlying pointer
-            // which is stored at the var_ptr_ptr operand of %1. Then we need to
-            // copy this pointer to the new heap allocated memory location.
-            // Then, we need to store the address of the new heap location in
-            // the originating struct/descriptor. So, we generate the following
-            // (pseudo) MLIR code (Using the same names of mlir::Value instances
-            // in the example as in the code below)
-            //
-            // %dataMalloc = malloc(totalSize)
-            // %loadDataPtr = load %0 : !llvm.ptr -> !llvm.ptr
-            // memcpy(%dataMalloc, %loadDataPtr, totalSize)
-            // %newVarPtrPtrOp = llvm.getelementptr %heapMem[0, 0]
-            // llvm.store %dataMalloc, %newVarPtrPtrOp
-            // %1.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, i32) ...
-            //                          var_ptr_ptr(%newVarPtrPtrOp : !llvm.ptr)
-            // %2.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr,
-            //                                             !desc_type)>) ...
-            //                          members(%1.cloned : [0] : !llvm.ptr)
-            //             -> !llvm.ptr
-            // omp.target nowait map_entries(%2.cloned -> %arg5,
-            //                               %1.cloned -> %arg8 : ..)
-            //            private(@privatizer %heapMem -> .. [map_idx=1] : ..) {
-
-            if (isFirstPrivate) {
-              assert(!memberMapInfoOp.getBounds().empty() &&
-                     "empty bounds on member map of firstprivate variable");
-              mlir::Location loc = memberMapInfoOp.getLoc();
-              mlir::Value totalSize =
-                  getSizeInBytes(memberMapInfoOp, mod, rewriter);
-              auto dataMalloc = allocateHeapMem(loc, totalSize, mod, rewriter);
-              auto loadDataPtr = rewriter.create<LLVM::LoadOp>(
-                  loc, memberMapInfoOp.getVarPtrPtr().getType(),
-                  memberMapInfoOp.getVarPtrPtr());
-              LLVM_ATTRIBUTE_UNUSED auto memcpy =
-                  rewriter.create<mlir::LLVM::MemcpyOp>(
-                      loc, dataMalloc.getResult(), loadDataPtr.getResult(),
-                      totalSize, /*isVolatile=*/false);
-              Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp);
-              rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(),
-                                            newVarPtrPtrOp->getOpResult(0),
-                                            loadDataPtr);
-              rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() {
-                newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem);
-              });
-              LLVM_ATTRIBUTE_UNUSED auto storePtr =
-                  rewriter.create<LLVM::StoreOp>(loc, dataMalloc.getResult(),
-                                                 newVarPtrPtrOp->getResult(0));
-            } else
-              rewriter.setInsertionPoint(
-                  cloneAndMarkForDeletion(varPtrPtrdefOp));
-          }
-        }
-      }
-
-      for (auto repl : replRecord) {
-        Operation *origOp = repl.first;
-        Operation *clonedOp = repl.second;
-        rewriter.modifyOpInPlace(
-            clonedOp, [&]() { clonedOp->replaceUsesOfWith(varPtr, heapMem); });
-        rewriter.eraseOp(origOp);
-      }
-    }
-    assert(newPrivVars.size() == privateVars.size() &&
-           "The number of private variables must match before and after "
-           "transformation");
-
-    rewriter.setInsertionPoint(targetOp);
-    Operation *newOp = rewriter.clone(*targetOp.getOperation());
-    omp::TargetOp newTargetOp = mlir::cast<omp::TargetOp>(newOp);
-    rewriter.modifyOpInPlace(newTargetOp, [&]() {
-      newTargetOp.getPrivateVarsMutable().assign(newPrivVars);
-    });
-    rewriter.replaceOp(targetOp, newTargetOp);
-    return mlir::success();
-  }
-
-private:
-  bool hasPrivateVars(omp::TargetOp targetOp) const {
-    return !targetOp.getPrivateVars().empty();
-  }
-
-  bool isTargetTaskDeferred(omp::TargetOp targetOp) const {
-    return targetOp.getNowait();
-  }
-
-  template <typename OpTy>
-  omp::PrivateClauseOp findPrivatizer(OpTy op, mlir::Attribute privSym) const {
-    SymbolRefAttr privatizerName = llvm::cast<SymbolRefAttr>(privSym);
-    omp::PrivateClauseOp privatizer =
-        SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
-            op, privatizerName);
-    return privatizer;
-  }
-
-  template <typename OpType>
-  mlir::Type getElemType(OpType op) const {
-    return op.getElemType();
-  }
-
-  mlir::Type getElemType(mlir::Value varPtr) const {
-    Operation *definingOp = unwrapAddrSpaceCast(varPtr.getDefiningOp());
-    assert((mlir::isa<LLVM::AllocaOp, LLVM::GEPOp>(definingOp)) &&
-           "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only "
-           "with Alloca or GEP for now");
-    if (auto allocaOp = mlir::dyn_cast<LLVM::AllocaOp>(definingOp))
-      return getElemType(allocaOp);
-    // TODO: get rid of this because GEPOp.getElemType() is not the right thing
-    // to use.
-    if (auto gepOp = mlir::dyn_cast<LLVM::GEPOp>(definingOp))
-      return getElemType(gepOp);
-    return mlir::Type{};
-  }
-
-  mlir::Operation *unwrapAddrSpaceCast(Operation *op) const {
-    if (!mlir::isa<LLVM::AddrSpaceCastOp>(op))
-      return op;
-    mlir::LLVM::AddrSpaceCastOp addrSpaceCastOp =
-        mlir::cast<LLVM::AddrSpaceCastOp>(op);
-    return unwrapAddrSpaceCast(addrSpaceCastOp.getArg().getDefiningOp());
-  }
-
-  // Get the (compile-time constant) size of varType as per the
-  // given DataLayout dl.
-  std::int64_t getSizeInBytes(const mlir::DataLayout &dl,
-                              mlir::Type varType) const {
-    llvm::TypeSize size = dl.getTypeSize(varType);
-    unsigned short alignment = dl.getTypeABIAlignment(varType);
-    return llvm::alignTo(size, alignment);
-  }
-
-  // Generate code to get the size of data being mapped from the bounds
-  // of mapInfoOp
-  mlir::Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
-                             PatternRewriter &rewriter) const {
-    mlir::Location loc = mapInfoOp.getLoc();
-    mlir::Type llvmInt64Ty = rewriter.getI64Type();
-    mlir::Value constOne =
-        rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, 1);
-    mlir::Value elementCount = constOne;
-    // TODO: Consider using  boundsOp.getExtent() if available.
-    for (auto bounds : mapInfoOp.getBounds()) {
-      auto boundsOp = mlir::cast<omp::MapBoundsOp>(bounds.getDefiningOp());
-      elementCount = rewriter.create<LLVM::MulOp>(
-          loc, llvmInt64Ty, elementCount,
-          rewriter.create<LLVM::AddOp>(
-              loc, llvmInt64Ty,
-              (rewriter.create<LLVM::SubOp>(loc, llvmInt64Ty,
-                                            boundsOp.getUpperBound(),
-                                            boundsOp.getLowerBound())),
-              constOne));
-    }
-    const mlir::DataLayout &dl = mlir::DataLayout(mod);
-    std::int64_t elemSize = getSizeInBytes(dl, mapInfoOp.getVarType());
-    mlir::Value elemSizeV =
-        rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, elemSize);
-    return rewriter.create<LLVM::MulOp>(loc, llvmInt64Ty, elementCount,
-                                        elemSizeV);
-  }
-
-  LLVM::LLVMFuncOp getMalloc(ModuleOp mod, PatternRewriter &rewriter) const {
-    llvm::FailureOr<mlir::LLVM::LLVMFuncOp> mallocCall =
-        LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
-    assert(llvm::succeeded(mallocCall) &&
-           "Could not find malloc in the module");
-    return mallocCall.value();
-  }
-
-  template <typename OpTy>
-  mlir::Value allocateHeapMem(OpTy targetOp, mlir::Value privVar, ModuleOp mod,
-                              PatternRewriter &rewriter) const {
-    mlir::Value varPtr = privVar;
-    Operation *definingOp = varPtr.getDefiningOp();
-    OpBuilder::InsertionGuard guard(rewriter);
-    rewriter.setInsertionPoint(definingOp);
-    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
-
-    mlir::Location loc = definingOp->getLoc();
-    mlir::Type varType = getElemType(varPtr);
-    assert(mod.getDataLayoutSpec() &&
-           "MLIR module with no datalayout spec not handled yet");
-    const mlir::DataLayout &dl = mlir::DataLayout(mod);
-    std::int64_t distance = getSizeInBytes(dl, varType);
-    mlir::Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
-        loc, mallocFn.getFunctionType().getParamType(0), distance);
-
-    auto mallocCallOp =
-        rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{sizeBytes});
-    return mallocCallOp.getResult();
-  }
-
-  LLVM::CallOp allocateHeapMem(mlir::Location loc, mlir::Value size,
-                               ModuleOp mod, PatternRewriter &rewriter) const {
-    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
-    return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});
-  }
-};
-
 //===----------------------------------------------------------------------===//
 // PrepareForOMPOffloadPrivatizationPass
 //===----------------------------------------------------------------------===//
@@ -393,50 +49,31 @@ class PrepareForOMPOffloadPrivatizationPass
           PrepareForOMPOffloadPrivatizationPass> {
 
   void runOnOperation() override {
-    LLVM::LLVMFuncOp func = getOperation();
-    LLVM_DEBUG(llvm::dbgs() << "In PrepareForOMPOffloadPrivatizationPass\n");
-    LLVM_DEBUG(llvm::dbgs() << "Func is \n" << func << "\n");
-    ModuleOp mod = func->getParentOfType<ModuleOp>();
+    ModuleOp mod = getOperation()->getParentOfType<ModuleOp>();
 
     // FunctionFilteringPass removes bounds arguments from omp.map.info
     // operations. We require bounds else our pass asserts. But, that's only for
     // maps in functions that are on the host. So, skip functions being compiled
     // for the target.
     auto offloadModuleInterface =
-        mlir::dyn_cast<omp::OffloadModuleInterface>(mod.getOperation());
-    if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) {
+        dyn_cast<omp::OffloadModuleInterface>(mod.getOperation());
+    if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice())
       return;
-    }
-#if 0
-    MLIRContext &context = getContext();
-
-    RewritePatternSet patterns(&context);
-    patterns.add<OMPTargetPrepareDelayedPrivatizationPattern>(&context);
-
-    if (mlir::failed(
-            applyPatternsGreedily(func, std::move(patterns),
-                                  GreedyRewriteConfig().setStrictness(
-                                      GreedyRewriteStrictness::ExistingOps)))) {
-      emitError(func.getLoc(),
-                "error in preparing targetOps for delayed privatization.");
-      signalPassFailure();
-    }
-#else
+
     getOperation()->walk([&](omp::TargetOp targetOp) {
       if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp))
         return;
       IRRewriter rewriter(&getContext());
       ModuleOp mod = targetOp->getParentOfType<ModuleOp>();
-      LLVM::LLVMFuncOp llvmFunc = targetOp->getParentOfType<LLVM::LLVMFuncOp>();
       OperandRange privateVars = targetOp.getPrivateVars();
-      mlir::SmallVector<mlir::Value> newPrivVars;
+      SmallVector<mlir::Value> newPrivVars;
 
       newPrivVars.reserve(privateVars.size());
       std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
       for (auto [privVarIdx, privVarSymPair] :
            llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) {
-        auto privVar = std::get<0>(privVarSymPair);
-        auto privSym = std::get<1>(privVarSymPair);
+        Value privVar = std::get<0>(privVarSymPair);
+        Attribute privSym = std::get<1>(privVarSymPair);
 
         omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym);
         if (!privatizer.needsMap()) {
@@ -446,10 +83,10 @@ class PrepareForOMPOffloadPrivatizationPass
         bool isFirstPrivate = privatizer.getDataSharingType() ==
                               omp::DataSharingClauseType::FirstPrivate;
 
-        mlir::Value mappedValue =
+        Value mappedValue =
             targetOp.getMappedValueForPrivateVar(privVarIdx);
         Operation *mapInfoOperation = mappedValue.getDefiningOp();
-        auto mapInfoOp = mlir::cast<omp::MapInfoOp>(mapInfoOperation);
+        auto mapInfoOp = cast<omp::MapInfoOp>(mapInfoOperation);
 
         if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) {
           newPrivVars.push_back(privVar);
@@ -459,8 +96,8 @@ class PrepareForOMPOffloadPrivatizationPass
         // Allocate heap memory that corresponds to the type of memory
         // pointed to by varPtr
         // TODO: For boxchars this likely wont be a pointer.
-        mlir::Value varPtr = privVar;
-        mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
+        Value varPtr = privVar;
+        Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
         if (!heapMem)
           targetOp.emitError("Unable to allocate heap memory when try to move "
                              "a private variable out of the stack and into the "
@@ -472,7 +109,7 @@ class PrepareForOMPOffloadPrivatizationPass
         // After the copy these omp::MapInfoOp instances will refer to heapMem
         // instead.
         Operation *varPtrDefiningOp = varPtr.getDefiningOp();
-        std::set<Operation *> users;
+        DenseSet<Operation *> users;
         users.insert(varPtrDefiningOp->user_begin(),
                      varPtrDefiningOp->user_end());
 
@@ -487,7 +124,7 @@ class PrepareForOMPOffloadPrivatizationPass
               chainOfOps.push_back(member.getDefiningOp());
 
             omp::MapInfoOp memberMap =
-                mlir::cast<omp::MapInfoOp>(member.getDefiningOp());
+                cast<omp::MapInfoOp>(member.getDefiningOp());
             if (memberMap.getVarPtrPtr() &&
                 usesVarPtr(memberMap.getVarPtrPtr().getDefiningOp()))
               chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp());
@@ -501,11 +138,10 @@ class PrepareForOMPOffloadPrivatizationPass
         rewriter.setInsertionPoint(chainOfOps.front());
         // Copy the value of the local variable into the heap-allocated
         // location.
-        mlir::Location loc = chainOfOps.front()->getLoc();
-        mlir::Type varType = getElemType(varPtr);
+        Location loc = chainOfOps.front()->getLoc();
+        Type varType = getElemType(varPtr);
         auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
-        LLVM_ATTRIBUTE_UNUSED auto storeInst =
-            rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
+        (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
 
         using ReplacementEntry = std::pair<Operation *, Operation *>;
         llvm::SmallVector<ReplacementEntry> replRecord;
@@ -528,7 +164,7 @@ class PrepareForOMPOffloadPrivatizationPass
             rewriter.setInsertionPoint(
                 cloneAndMarkForDeletion(memberOperation));
 
-            auto memberMapInfoOp = mlir::cast<omp::MapInfoOp>(memberOperation);
+            auto memberMapInfoOp = cast<omp::MapInfoOp>(memberOperation);
             if (memberMapInfoOp.getVarPtrPtr()) {
               Operation *varPtrPtrdefOp =
                   memberMapInfoOp.getVarPtrPtr().getDefiningOp();
@@ -578,18 +214,17 @@ class PrepareForOMPOffloadPrivatizationPass
               if (isFirstPrivate) {
                 assert(!memberMapInfoOp.getBounds().empty() &&
                        "empty bounds on member map of firstprivate variable");
-                mlir::Location loc = memberMapInfoOp.getLoc();
-                mlir::Value totalSize =
+                Location loc = memberMapInfoOp.getLoc();
+                Value totalSize =
                     getSizeInBytes(memberMapInfoOp, mod, rewriter);
                 auto dataMalloc =
                     allocateHeapMem(loc, totalSize, mod, rewriter);
                 auto loadDataPtr = rewriter.create<LLVM::LoadOp>(
                     loc, memberMapInfoOp.getVarPtrPtr().getType(),
                     memberMapInfoOp.getVarPtrPtr());
-                LLVM_ATTRIBUTE_UNUSED auto memcpy =
-                    rewriter.create<mlir::LLVM::MemcpyOp>(
-                        loc, dataMalloc.getResult(), loadDataPtr.getResult(),
-                        totalSize, /*isVolatile=*/false);
+                (void)rewriter.create<LLVM::MemcpyOp>(
+                    loc, dataMalloc.getResult(), loadDataPtr.getResult(),
+                    totalSize, /*isVolatile=*/false);
                 Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp);
                 rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(),
                                               newVarPtrPtrOp->getOpResult(0),
@@ -597,13 +232,12 @@ class PrepareForOMPOffloadPrivatizationPass
                 rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() {
                   newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem);
                 });
-                LLVM_ATTRIBUTE_UNUSED auto storePtr =
-                    rewriter.create<LLVM::StoreOp>(
-                        loc, dataMalloc.getResult(),
-                        newVarPtrPtrOp->getResult(0));
-              } else
+                (void)rewriter.create<LLVM::StoreOp>(
+                    loc, dataMalloc.getResult(), newVarPtrPtrOp->getResult(0));
+              } else {
                 rewriter.setInsertionPoint(
                     cloneAndMarkForDeletion(varPtrPtrdefOp));
+              }
             }
           }
         }
@@ -623,14 +257,14 @@ class PrepareForOMPOffloadPrivatizationPass
 
       rewriter.setInsertionPoint(targetOp);
       Operation *newOp = rewriter.clone(*targetOp.getOperation());
-      omp::TargetOp newTargetOp = mlir::cast<omp::TargetOp>(newOp);
+      omp::TargetOp newTargetOp = cast<omp::TargetOp>(newOp);
       rewriter.modifyOpInPlace(newTargetOp, [&]() {
         newTargetOp.getPrivateVarsMutable().assign(newPrivVars);
       });
       rewriter.replaceOp(targetOp, newTargetOp);
     });
-#endif
   }
+
 private:
   bool hasPrivateVars(omp::TargetOp targetOp) const {
     return !targetOp.getPrivateVars().empty();
@@ -641,7 +275,7 @@ class PrepareForOMPOffloadPrivatizationPass
   }
 
   template <typename OpTy>
-  omp::PrivateClauseOp findPrivatizer(OpTy op, mlir::Attribute privSym) const {
+  omp::PrivateClauseOp findPrivatizer(OpTy op, Attribute privSym) const {
     SymbolRefAttr privatizerName = llvm::cast<SymbolRefAttr>(privSym);
     omp::PrivateClauseOp privatizer =
         SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
@@ -650,36 +284,36 @@ class PrepareForOMPOffloadPrivatizationPass
   }
 
   template <typename OpType>
-  mlir::Type getElemType(OpType op) const {
+  Type getElemType(OpType op) const {
     return op.getElemType();
   }
 
-  mlir::Type getElemType(mlir::Value varPtr) const {
+  Type getElemType(Value varPtr) const {
     Operation *definingOp = unwrapAddrSpaceCast(varPtr.getDefiningOp());
-    assert((mlir::isa<LLVM::AllocaOp, LLVM::GEPOp>(definingOp)) &&
+    assert((isa<LLVM::AllocaOp, LLVM::GEPOp>(definingOp)) &&
            "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only "
            "with Alloca or GEP for now");
-    if (auto allocaOp = mlir::dyn_cast<LLVM::AllocaOp>(definingOp))
+    if (auto allocaOp = dyn_cast<LLVM::AllocaOp>(definingOp))
       return getElemType(allocaOp);
     // TODO: get rid of this because GEPOp.getElemType() is not the right thing
     // to use.
-    if (auto gepOp = mlir::dyn_cast<LLVM::GEPOp>(definingOp))
+    if (auto gepOp = dyn_cast<LLVM::GEPOp>(definingOp))
       return getElemType(gepOp);
-    return mlir::Type{};
+    return Type{};
   }
 
-  mlir::Operation *unwrapAddrSpaceCast(Operation *op) const {
-    if (!mlir::isa<LLVM::AddrSpaceCastOp>(op))
+  Operation *unwrapAddrSpaceCast(Operation *op) const {
+    if (!isa<LLVM::AddrSpaceCastOp>(op))
       return op;
-    mlir::LLVM::AddrSpaceCastOp addrSpaceCastOp =
-        mlir::cast<LLVM::AddrSpaceCastOp>(op);
+    LLVM::AddrSpaceCastOp addrSpaceCastOp =
+        cast<LLVM::AddrSpaceCastOp>(op);
     return unwrapAddrSpaceCast(addrSpaceCastOp.getArg().getDefiningOp());
   }
 
   // Get the (compile-time constant) size of varType as per the
   // given DataLayout dl.
-  std::int64_t getSizeInBytes(const mlir::DataLayout &dl,
-                              mlir::Type varType) const {
+  std::int64_t getSizeInBytes(const DataLayout &dl,
+                              Type varType) const {
     llvm::TypeSize size = dl.getTypeSize(varType);
     unsigned short alignment = dl.getTypeABIAlignment(varType);
     return llvm::alignTo(size, alignment);
@@ -687,16 +321,16 @@ class PrepareForOMPOffloadPrivatizationPass
 
   // Generate code to get the size of data being mapped from the bounds
   // of mapInfoOp
-  mlir::Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
+  Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
                              IRRewriter &rewriter) const {
-    mlir::Location loc = mapInfoOp.getLoc();
-    mlir::Type llvmInt64Ty = rewriter.getI64Type();
-    mlir::Value constOne =
+    Location loc = mapInfoOp.getLoc();
+    Type llvmInt64Ty = rewriter.getI64Type();
+    Value constOne =
         rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, 1);
-    mlir::Value elementCount = constOne;
+    Value elementCount = constOne;
     // TODO: Consider using  boundsOp.getExtent() if available.
     for (auto bounds : mapInfoOp.getBounds()) {
-      auto boundsOp = mlir::cast<omp::MapBoundsOp>(bounds.getDefiningOp());
+      auto boundsOp = cast<omp::MapBoundsOp>(bounds.getDefiningOp());
       elementCount = rewriter.create<LLVM::MulOp>(
           loc, llvmInt64Ty, elementCount,
           rewriter.create<LLVM::AddOp>(
@@ -706,16 +340,16 @@ class PrepareForOMPOffloadPrivatizationPass
                                             boundsOp.getLowerBound())),
               constOne));
     }
-    const mlir::DataLayout &dl = mlir::DataLayout(mod);
+    const DataLayout &dl = DataLayout(mod);
     std::int64_t elemSize = getSizeInBytes(dl, mapInfoOp.getVarType());
-    mlir::Value elemSizeV =
+    Value elemSizeV =
         rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, elemSize);
     return rewriter.create<LLVM::MulOp>(loc, llvmInt64Ty, elementCount,
                                         elemSizeV);
   }
 
   LLVM::LLVMFuncOp getMalloc(ModuleOp mod, IRRewriter &rewriter) const {
-    llvm::FailureOr<mlir::LLVM::LLVMFuncOp> mallocCall =
+    llvm::FailureOr<LLVM::LLVMFuncOp> mallocCall =
         LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
     assert(llvm::succeeded(mallocCall) &&
            "Could not find malloc in the module");
@@ -723,21 +357,21 @@ class PrepareForOMPOffloadPrivatizationPass
   }
 
   template <typename OpTy>
-  mlir::Value allocateHeapMem(OpTy targetOp, mlir::Value privVar, ModuleOp mod,
+  Value allocateHeapMem(OpTy targetOp, Value privVar, ModuleOp mod,
                               IRRewriter &rewriter) const {
-    mlir::Value varPtr = privVar;
+    Value varPtr = privVar;
     Operation *definingOp = varPtr.getDefiningOp();
     OpBuilder::InsertionGuard guard(rewriter);
     rewriter.setInsertionPoint(definingOp);
     LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
 
-    mlir::Location loc = definingOp->getLoc();
-    mlir::Type varType = getElemType(varPtr);
+    Location loc = definingOp->getLoc();
+    Type varType = getElemType(varPtr);
     assert(mod.getDataLayoutSpec() &&
            "MLIR module with no datalayout spec not handled yet");
-    const mlir::DataLayout &dl = mlir::DataLayout(mod);
+    const DataLayout &dl = DataLayout(mod);
     std::int64_t distance = getSizeInBytes(dl, varType);
-    mlir::Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
+    Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
         loc, mallocFn.getFunctionType().getParamType(0), distance);
 
     auto mallocCallOp =
@@ -745,7 +379,7 @@ class PrepareForOMPOffloadPrivatizationPass
     return mallocCallOp.getResult();
   }
 
-  LLVM::CallOp allocateHeapMem(mlir::Location loc, mlir::Value size,
+  LLVM::CallOp allocateHeapMem(Location loc, Value size,
                                ModuleOp mod, IRRewriter &rewriter) const {
     LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
     return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});

>From f4a13ea2093371a044ec9a29298cd62b1c357e2d Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 10 Sep 2025 15:37:37 -0500
Subject: [PATCH 07/27] Handle boxchars

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 46 +++++++++++++++----
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 781ec34435e1d..a61e57564657f 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-//#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
@@ -17,6 +16,7 @@
 #include "mlir/IR/Dominance.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/Support/DebugLog.h"
 #include <cstdint>
 #include <utility>
 
@@ -95,15 +95,31 @@ class PrepareForOMPOffloadPrivatizationPass
 
         // Allocate heap memory that corresponds to the type of memory
         // pointed to by varPtr
-        // TODO: For boxchars this likely wont be a pointer.
+        // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols
+        // should have mapped the pointer to the boxchar so use that as varPtr.
         Value varPtr = privVar;
-        Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter);
+        if (!isa<LLVM::LLVMPointerType>(privVar.getType()))
+          varPtr = mapInfoOp.getVarPtr();
+
+        assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
+        Value heapMem = allocateHeapMem(targetOp, varPtr, mod, rewriter);
         if (!heapMem)
-          targetOp.emitError("Unable to allocate heap memory when try to move "
-                             "a private variable out of the stack and into the "
-                             "heap for use by a deferred target task");
+          targetOp.emitError(
+              "Unable to allocate heap memory when trying to move "
+              "a private variable out of the stack and into the "
+              "heap for use by a deferred target task");
+
+        // The types of private vars should match before and after the
+        // transformation. In particular, if the type is a pointer,
+        // simply record the newly allocated malloc location as the
+        // new private variable. If, however, the type is not a pointer,
+        // then we need to load the value from the newly allocated
+        // location. We'll insert that load later after we have updated
+        // the malloc'd location with the contents of the original
+        // variable.
+        if (isa<LLVM::LLVMPointerType>(privVar.getType()))
+          newPrivVars.push_back(heapMem);
 
-        newPrivVars.push_back(heapMem);
         // Find the earliest insertion point for the copy. This will be before
         // the first in the list of omp::MapInfoOp instances that use varPtr.
         // After the copy these omp::MapInfoOp instances will refer to heapMem
@@ -250,6 +266,18 @@ class PrepareForOMPOffloadPrivatizationPass
           });
           rewriter.eraseOp(origOp);
         }
+
+        // If the type of the private variable is not a pointer,
+        // which is typically the case with !fir.boxchar types, then
+        // we need to ensure that the new private variable is also
+        // not a pointer. Insert a load from heapMem right before
+        // targetOp.
+        if (!isa<LLVM::LLVMPointerType>(privVar.getType())) {
+          rewriter.setInsertionPoint(targetOp);
+          auto newPrivVar = rewriter.create<LLVM::LoadOp>(mapInfoOp.getLoc(),
+                                                          varType, heapMem);
+          newPrivVars.push_back(newPrivVar);
+        }
       }
       assert(newPrivVars.size() == privateVars.size() &&
              "The number of private variables must match before and after "
@@ -358,7 +386,7 @@ class PrepareForOMPOffloadPrivatizationPass
 
   template <typename OpTy>
   Value allocateHeapMem(OpTy targetOp, Value privVar, ModuleOp mod,
-                              IRRewriter &rewriter) const {
+                        IRRewriter &rewriter) const {
     Value varPtr = privVar;
     Operation *definingOp = varPtr.getDefiningOp();
     OpBuilder::InsertionGuard guard(rewriter);

>From c8fad74bd720921c9f1a0c7c071d9ba7bf259f19 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 10:44:59 -0500
Subject: [PATCH 08/27] fix testcases

---
 flang/test/Driver/tco-test-gen.fir            |   5 +-
 flang/test/Fir/alloc-32.fir                   |   2 +-
 flang/test/Fir/alloc.fir                      |  14 +-
 flang/test/Fir/arrexp.fir                     |   4 +-
 flang/test/Fir/box.fir                        |   6 +-
 flang/test/Fir/optional.fir                   |   3 +-
 flang/test/Fir/rebox.fir                      |  18 +--
 flang/test/Fir/tbaa-codegen2.fir              |  12 +-
 flang/test/Lower/forall/character-1.f90       |   4 +-
 .../omp-offload-privatization-prepare.mlir    | 148 +++++++++---------
 10 files changed, 110 insertions(+), 106 deletions(-)

diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir
index 15483f7ee3534..38d4e50ecf3aa 100644
--- a/flang/test/Driver/tco-test-gen.fir
+++ b/flang/test/Driver/tco-test-gen.fir
@@ -42,10 +42,11 @@ func.func @_QPtest(%arg0: !fir.ref<i32> {fir.bindc_name = "num"}, %arg1: !fir.re
 // CHECK-SAME:      %[[ARG2:.*]]: !llvm.ptr {fir.bindc_name = "ub", llvm.nocapture},
 // CHECK-SAME:      %[[ARG3:.*]]: !llvm.ptr {fir.bindc_name = "step", llvm.nocapture}) {
 
-// CMPLX:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
-// CMPLX:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
 // CMPLX:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CMPLX:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+// CMPLX:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
+// CMPLX:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
+// CMPLX:           %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64
 
 // SIMPLE:          %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
 // SIMPLE:          %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir
index f57f6ce6fcf5e..a3cbf200c24fc 100644
--- a/flang/test/Fir/alloc-32.fir
+++ b/flang/test/Fir/alloc-32.fir
@@ -19,7 +19,7 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap<i32> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1
+// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32
diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir
index 0d3ce323d0d7c..98d0a03790a58 100644
--- a/flang/test/Fir/alloc.fir
+++ b/flang/test/Fir/alloc.fir
@@ -86,7 +86,7 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref<!fir.char<2,?>> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1
+// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -98,7 +98,7 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap<!fir.char<1,?>> {
 // CHECK-LABEL: define ptr @allocmem_scalar_dynchar_kind(
 // CHECK-SAME: i32 %[[len:.*]])
 // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 2
+// CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -185,7 +185,7 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref<!fir.array<?x?xi32
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_nonchar(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 12
+// CHECK: %[[prod1:.*]] = mul i64 12, %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod1]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod1]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -196,7 +196,7 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap<!fir.array<3x?xi
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_nonchar2(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 4
+// CHECK: %[[prod1:.*]] = mul i64 4, %[[extent]]
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod2]], i64 1
@@ -227,7 +227,7 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref<!fir.array<?x?x!fir.
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_char(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 60
+// CHECK: %[[prod1:.*]] = mul i64 60, %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod1]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[prod1]], i64 1
 // CHECK: call ptr @malloc(i64 %[[size]])
@@ -238,7 +238,7 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap<!fir.array<3x?x!fi
 
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_char2(
 // CHECK-SAME: i64 %[[extent:.*]])
-// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 20
+// CHECK: %[[prod1:.*]] = mul i64 20, %[[extent]]
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod2]], 0
 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
@@ -286,7 +286,7 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap<!fir.a
 // CHECK-LABEL: define ptr @allocmem_dynarray_of_dynchar2(
 // CHECK-SAME: i32 %[[len:.*]], i64 %[[extent:.*]])
 // CHECK: %[[a:.*]] = sext i32 %[[len]] to i64
-// CHECK: %[[prod1:.*]] = mul i64 %[[a]], 2
+// CHECK: %[[prod1:.*]] = mul i64 2, %[[a]]
 // CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
 // CHECK: %[[prod3:.*]] = mul i64 %[[prod2]], %[[extent]]
 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[prod3]], 0
diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir
index 2eb717228d998..e8ec8ac79e0c2 100644
--- a/flang/test/Fir/arrexp.fir
+++ b/flang/test/Fir/arrexp.fir
@@ -143,9 +143,9 @@ func.func @f6(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: f32) {
   %c9 = arith.constant 9 : index
   %c10 = arith.constant 10 : index
 
-  // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i32 0, i32 1
+  // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1
   // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]]
-  // CHECK: %[[SIZE:.*]] = mul i64 %[[EXTENT]], 4
+  // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]]
   // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0
   // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1
   // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SZ]])
diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir
index 760fbd4792122..c0cf3d8375983 100644
--- a/flang/test/Fir/box.fir
+++ b/flang/test/Fir/box.fir
@@ -57,7 +57,7 @@ func.func @fa(%a : !fir.ref<!fir.array<100xf32>>) {
 // CHECK-SAME: ptr {{[^%]*}}%[[res:.*]], ptr {{[^%]*}}%[[arg0:.*]], i64 %[[arg1:.*]])
 func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.char<1,?>> {
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
-  // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1
+  // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]]
   // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
@@ -89,7 +89,7 @@ func.func @b2(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,5>>>, %arg1 : index) ->
 func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %arg2 : index) -> !fir.box<!fir.array<?x!fir.char<1,?>>> {
   %1 = fir.shape %arg2 : (index) -> !fir.shape<1>
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-  // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1
+  // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 %[[arg2]], 7, 0, 1
@@ -108,7 +108,7 @@ func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) ->
   %c_7 = arith.constant 7 : index
   %1 = fir.shape %c_7 : (index) -> !fir.shape<1>
   // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
-  // CHECK:   %[[size:.*]] = mul i64 %[[arg1]], 1
+  // CHECK:   %[[size:.*]] = mul i64 1, %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} i64 7, 7, 0, 1
diff --git a/flang/test/Fir/optional.fir b/flang/test/Fir/optional.fir
index 66ff69f083467..bded8b5332a30 100644
--- a/flang/test/Fir/optional.fir
+++ b/flang/test/Fir/optional.fir
@@ -37,7 +37,8 @@ func.func @bar2() -> i1 {
 
 // CHECK-LABEL: @foo3
 func.func @foo3(%arg0: !fir.boxchar<1>) -> i1 {
-  // CHECK: %[[ptr:.*]] = ptrtoint ptr %0 to i64
+  // CHECK: %[[extract:.*]] = extractvalue { ptr, i64 } %{{.*}}, 0
+  // CHECK: %[[ptr:.*]] = ptrtoint ptr %[[extract]] to i64
   // CHECK: icmp ne i64 %[[ptr]], 0
   %0 = fir.is_present %arg0 : (!fir.boxchar<1>) -> i1
   return %0 : i1
diff --git a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir
index d858adfb7c45d..0c9f6d9bb94ad 100644
--- a/flang/test/Fir/rebox.fir
+++ b/flang/test/Fir/rebox.fir
@@ -36,7 +36,7 @@ func.func @test_rebox_1(%arg0: !fir.box<!fir.array<?x?xf32>>) {
   // CHECK: %[[VOIDBASE0:.*]] = getelementptr i8, ptr %[[INBASE]], i64 %[[OFFSET_0]]
   // CHECK: %[[OFFSET_1:.*]] = mul i64 2, %[[INSTRIDE_1]]
   // CHECK: %[[VOIDBASE1:.*]] = getelementptr i8, ptr %[[VOIDBASE0]], i64 %[[OFFSET_1]]
-  // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 %[[INSTRIDE_1]], 3
+  // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 3, %[[INSTRIDE_1]]
   // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[OUTSTRIDE0]], 7, 0, 2
   // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX1]], ptr %[[VOIDBASE1]], 0
   // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX2]], ptr %[[OUTBOX_ALLOC]], align 8
@@ -63,7 +63,7 @@ func.func @test_rebox_2(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
   // CHECK: %[[OUTBOX:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }
   // CHECK: %[[LEN_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 1
   // CHECK: %[[LEN:.*]] = load i64, ptr %[[LEN_GEP]]
-  // CHECK: %[[SIZE:.*]] = mul i64 %[[LEN]], 1
+  // CHECK: %[[SIZE:.*]] = mul i64 1, %[[LEN]]
   // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } undef, i64 %[[SIZE]], 1
 
   %1 = fir.rebox %arg0 [%0]  : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
@@ -94,8 +94,8 @@ func.func @test_rebox_3(%arg0: !fir.box<!fir.array<?xf32>>) {
   // CHECK: %[[INSTRIDE:.*]] = load i64, ptr %[[INSTRIDE_GEP]]
   // CHECK: %[[INBASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[INBASE:.*]] = load ptr, ptr %[[INBASE_GEP]]
-  // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 %[[INSTRIDE]], 3
-  // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 %[[OUTSTRIDE1]], 4
+  // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 3, %[[INSTRIDE]]
+  // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 4, %[[OUTSTRIDE1]]
   // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %{{.*}}, i64 %[[INSTRIDE]], 7, 0, 2
   // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX0]], i64 3, 7, 1, 0
   // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX1]], i64 4, 7, 1, 1
@@ -153,13 +153,13 @@ func.func @test_cmplx_1(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   %0:3 = fir.box_dims %arg0, %c0 : (!fir.box<!fir.array<?xcomplex<f32>>>, index) -> (index, index, index)
   %1 = fir.slice %c1, %0#1, %c1 path %c1_i32 : (index, index, index, i32) -> !fir.slice<1>
   %2 = fir.rebox %arg0 [%1] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.slice<1>) -> !fir.box<!fir.array<?xf32>>
-  // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 1
+  // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i64 0, i32 1
   // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]]
   // CHECK: %[[INSTRIDE_1_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 2
   // CHECK: %[[INSTRIDE_1:.*]] = load i64, ptr %[[INSTRIDE_1_GEP]]
   // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]]
-  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 0
+  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 0
   // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 0, %[[INSTRIDE_1]]
   // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]]
   // CHECK: %[[SUB_1:.*]] = sub i64 %[[INSTRIDE_0]], 1
@@ -167,7 +167,7 @@ func.func @test_cmplx_1(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   // CHECK: %[[DIV_1:.*]] = sdiv i64 %[[ADD_1]], 1
   // CHECK: %[[CHECK_NONZERO:.*]] = icmp sgt i64 %[[DIV_1]], 0
   // CHECK: %[[CHECKED_BOUND:.*]] = select i1 %[[CHECK_NONZERO]], i64 %[[DIV_1]], i64 0
-  // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_1]], 1
+  // CHECK: %[[STRIDE:.*]] = mul i64 1, %[[INSTRIDE_1]]
   // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[CHECKED_BOUND]], 7, 0, 1
   // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], i64 %[[STRIDE]], 7, 0, 2
   // CHECK: %[[VAL_BUILD_3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OFFSET_GEP]], 0
@@ -198,10 +198,10 @@ func.func @test_cmplx_2(%arg0: !fir.box<!fir.array<?xcomplex<f32>>>) {
   // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]]
   // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0
   // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]]
-  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 1
+  // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 1
   // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 6, %[[INSTRIDE_0]]
   // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]]
-  // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_0]], 5
+  // CHECK: %[[STRIDE:.*]] = mul i64 5, %[[INSTRIDE_0]]
   // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[STRIDE]], 7, 0, 2
   // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], ptr %[[OFFSET_GEP]], 0
   // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OUTBOX_ALLOC]]
diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir
index 072c8bbe4e80c..4907aa03ec5a5 100644
--- a/flang/test/Fir/tbaa-codegen2.fir
+++ b/flang/test/Fir/tbaa-codegen2.fir
@@ -62,9 +62,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
 // CHECK-LABEL: define void @_QPfunc(
 // CHECK-SAME:      ptr {{[^%]*}}%[[ARG0:.*]]){{.*}}{
 // [...]
-// CHECK:  %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 6
+// CHECK:  %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 0
 // box access:
-// CHECK:  %[[VAL6:.*]] = load i8, ptr %[[VAL5]], align 1, !tbaa ![[BOX_ACCESS_TAG:.*]]
+// CHECK:  %[[VAL6:.*]] = load i64, ptr %[[VAL5]], align 4, !tbaa ![[BOX_ACCESS_TAG:.*]]
 // CHECK:  %[[VAL7:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i32 0, i32 7, i32 0, i32 1
 // box access:
 // CHECK:  %[[VAL8:.*]] = load i64, ptr %[[VAL7]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
@@ -76,9 +76,15 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
 // CHECK:  %[[VAL12:.*]] = load ptr, ptr %[[VAL11]], align 8, !tbaa ![[BOX_ACCESS_TAG]]
 // CHECK:  %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %[[VAL12]], 0
 // CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL15]], ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
-// CHECK:  %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i32 0, i32 1
+// CHECK:  %[[VAL16:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 0
+// box access:
+// CHECK:  %[[VAL17:.*]] = load i64, ptr %[[VAL16]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 1
 // box access:
 // CHECK:  %[[VAL19:.*]] = load i64, ptr %[[VAL18]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL20:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 2
+// box access:
+// CHECK:  %[[VAL21:.*]] = load i64, ptr %[[VAL20]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
 // [...]
 // box access:
 // CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90
index 7a1f4b125a79f..d1e12a8dbdfec 100644
--- a/flang/test/Lower/forall/character-1.f90
+++ b/flang/test/Lower/forall/character-1.f90
@@ -23,11 +23,11 @@ end program test
 
 ! CHECK-LABEL: define internal void @_QFPsub(
 ! CHECK-SAME:    ptr {{[^%]*}}%[[arg:.*]])
-! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i32 0, i32 1
+! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1
 ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]]
 ! CHECK: %[[elesize:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 1
 ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]]
-! CHECK: %[[mul:.*]] = mul i64 %[[esval]], 1
+! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]]
 ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]]
 ! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0
 ! CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1
diff --git a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
index 6b8121b262f47..48b937be86a33 100644
--- a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
+++ b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
@@ -82,86 +82,82 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
 // CHECK-LABEL:   llvm.func @malloc(i64) -> !llvm.ptr
 // CHECK:         llvm.func @free(!llvm.ptr)
 
-// CHECK-LABEL:   llvm.func internal @private_test(
-// CHECK: %[[VAL_0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-LABEL: llvm.func internal @private_test(
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
 // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64
 // CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
-// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_1]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
-// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_1]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
-// CHECK: llvm.store %[[VAL_0]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: llvm.store %[[VAL_1]], %[[VAL_6]] : i32, !llvm.ptr
-// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
-// CHECK: %[[VAL_8:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_9:.*]] = llvm.load %[[VAL_8]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_14:.*]] = llvm.sub %[[VAL_11]], %[[VAL_2]] : i64
-// CHECK: %[[VAL_15:.*]] = omp.map.bounds lower_bound(%[[VAL_2]] : i64) upper_bound(%[[VAL_14]] : i64) extent(%[[VAL_11]] : i64) stride(%[[VAL_13]] : i64) start_idx(%[[VAL_9]] : i64) {stride_in_bytes = true}
-// CHECK: %[[VAL_16:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[VAL_16]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_17]] : !llvm.ptr) bounds(%[[VAL_15]]) -> !llvm.ptr {name = ""}
-// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_18]] : [0] : !llvm.ptr) -> !llvm.ptr
-// CHECK: omp.target nowait map_entries(%[[VAL_7]] -> %[[VAL_20:.*]], %[[VAL_19]] -> %[[VAL_21:.*]], %[[VAL_18]] -> %[[VAL_22:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %[[HEAP]] -> %[[VAL_23:.*]] [map_idx=1] : !llvm.ptr) {
-// CHECK:   omp.terminator
+// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_7:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_7]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_0]], %[[VAL_6]] : i32, !llvm.ptr
+// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+// CHECK: %[[VAL_9:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_10:.*]] = llvm.load %[[VAL_9]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_11:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_12:.*]] = llvm.load %[[VAL_11]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_13:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_13]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_12]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_15]] : i64) extent(%[[VAL_12]] : i64) stride(%[[VAL_14]] : i64) start_idx(%[[VAL_10]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_17:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_17]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: %[[VAL_18:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_18]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_19]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_21:.*]], %[[VAL_20]] -> %[[VAL_22:.*]], %[[VAL_19]] -> %[[VAL_23:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %[[HEAP]] -> %[[VAL_24:.*]] [map_idx=1] : !llvm.ptr) {
+// CHECK: omp.terminator
 // CHECK: }
-// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> !llvm.ptr
-// CHECK: llvm.call @free(%[[VAL_25]]) : (!llvm.ptr) -> ()
+// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_27]]) : (!llvm.ptr) -> ()
 // CHECK: llvm.return
-// CHECK:         }
+// CHECK: }
 
-// CHECK-LABEL:   llvm.func internal @firstprivate_test(
-// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(4 : i64) : i64
-// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK: %[[VAL_2:.*]] = llvm.mlir.undef :
-// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(48 : i64) : i64
-// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_5]]) : (i64) -> !llvm.ptr
-// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_3]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_3]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
-// CHECK: llvm.store %[[VAL_2]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: llvm.store %[[VAL_3]], %[[VAL_8]] : i32, !llvm.ptr
-// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_8]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc)
-// CHECK-SAME: capture(ByCopy) -> !llvm.ptr {name = "i"}
-// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr,
-// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) ->
-// CHECK-SAME: !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_16:.*]] = llvm.sub %[[VAL_13]], %[[VAL_4]] : i64
-// CHECK: %[[VAL_17:.*]] = omp.map.bounds lower_bound(%[[VAL_4]] : i64) upper_bound(%[[VAL_16]] : i64) extent(%[[VAL_13]] : i64) stride(%[[VAL_15]] : i64) start_idx(%[[VAL_11]] : i64) {stride_in_bytes = true}
-// CHECK: %[[VAL_18:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[VAL_18]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_20:.*]] = llvm.sub %[[VAL_16]], %[[VAL_4]] : i64
-// CHECK: %[[VAL_21:.*]] = llvm.add %[[VAL_20]], %[[VAL_1]] : i64
-// CHECK: %[[VAL_22:.*]] = llvm.mul %[[VAL_1]], %[[VAL_21]] : i64
-// CHECK: %[[VAL_23:.*]] = llvm.mul %[[VAL_22]], %[[VAL_0]] : i64
-// CHECK: %[[NEW_DATA_PTR:.*]] = llvm.call @malloc(%[[VAL_23]]) : (i64) -> !llvm.ptr
-// CHECK: %[[OLD_DATA_PTR:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> !llvm.ptr
-// CHECK: "llvm.intr.memcpy"(%[[NEW_DATA_PTR]], %[[OLD_DATA_PTR]], %[[VAL_23]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
-// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[NEW_DATA_PTR]], %[[VAL_26]] : !llvm.ptr, !llvm.ptr
-// CHECK: %[[VAL_27:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef)
-// CHECK-SAME: var_ptr_ptr(%[[VAL_26]] : !llvm.ptr) bounds(%[[VAL_17]]) -> !llvm.ptr {name = ""}
-// CHECK: %[[VAL_28:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>)
-// CHECK-SAME: map_clauses(always, to) capture(ByRef) members(%[[VAL_27]] : [0] : !llvm.ptr) -> !llvm.ptr
-// CHECK: omp.target nowait map_entries(%[[VAL_9]] -> %[[VAL_29:.*]], %[[VAL_28]] -> %[[VAL_30:.*]], %[[VAL_27]] -> %[[VAL_31:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr)
-// CHECK-SAME: private(@firstprivatizer %[[HEAP]] -> %[[VAL_32:.*]] [map_idx=1] : !llvm.ptr) {
-// CHECK:   omp.terminator
+// CHECK-LABEL: llvm.func internal @firstprivate_test(
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
+// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_7:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_7]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_0]], %[[VAL_6]] : i32, !llvm.ptr
+// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+// CHECK: %[[VAL_9:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_10:.*]] = llvm.load %[[VAL_9]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_11:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_12:.*]] = llvm.load %[[VAL_11]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_13:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_13]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_12]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_15]] : i64) extent(%[[VAL_12]] : i64) stride(%[[VAL_14]] : i64) start_idx(%[[VAL_10]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_17:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_17]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: %[[VAL_18:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_20:.*]] = llvm.sub %[[VAL_15]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_21:.*]] = llvm.add %[[VAL_20]], %[[VAL_19]] : i64
+// CHECK: %[[VAL_22:.*]] = llvm.mul %[[VAL_19]], %[[VAL_21]] : i64
+// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(4 : i64) : i64
+// CHECK: %[[VAL_24:.*]] = llvm.mul %[[VAL_22]], %[[VAL_23]] : i64
+// CHECK: %[[NEW_DATA_PTR:.*]] = llvm.call @malloc(%[[VAL_24]]) : (i64) -> !llvm.ptr
+// CHECK: %[[OLD_DATA_PTR:.*]] = llvm.load %[[VAL_18]] : !llvm.ptr -> !llvm.ptr
+// CHECK: "llvm.intr.memcpy"(%[[NEW_DATA_PTR]], %[[OLD_DATA_PTR]], %[[VAL_24]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[NEW_DATA_PTR]], %[[VAL_27]] : !llvm.ptr, !llvm.ptr
+// CHECK: %[[VAL_28:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_27]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_29:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_28]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_30:.*]], %[[VAL_29]] -> %[[VAL_31:.*]], %[[VAL_28]] -> %[[VAL_32:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP]] -> %[[VAL_33:.*]] [map_idx=1] : !llvm.ptr) {
+// CHECK: omp.terminator
 // CHECK: }
-// CHECK: %[[VAL_33:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_34:.*]] = llvm.load %[[VAL_33]] : !llvm.ptr -> !llvm.ptr
-// CHECK: llvm.call @free(%[[VAL_34]]) : (!llvm.ptr) -> ()
+// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: %[[VAL_35:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_36:.*]] = llvm.load %[[VAL_35]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_36]]) : (!llvm.ptr) -> ()
 // CHECK: llvm.return
-// CHECK:         }
+// CHECK: }

>From e74fa7de98e870454123eeeef3f108387b99a75e Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 10:45:10 -0500
Subject: [PATCH 09/27] Revert "Fix CHECK stmts in test to account for constant
 folding done by the greedy pattern matcher"

This reverts commit c859bbc8fc63d53514502570af71c6dfeae68d9f.
---
 flang/test/Fir/omp_target_allocmem_freemem.fir | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/flang/test/Fir/omp_target_allocmem_freemem.fir b/flang/test/Fir/omp_target_allocmem_freemem.fir
index aa7b2dce07153..03eb94acb1ac7 100644
--- a/flang/test/Fir/omp_target_allocmem_freemem.fir
+++ b/flang/test/Fir/omp_target_allocmem_freemem.fir
@@ -62,7 +62,7 @@ func.func @omp_target_allocmem_scalar_char_kind() -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar(
 // CHECK-SAME: i32 [[TMP0:%.*]]) {
 // CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 1
+// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
 // CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
@@ -80,7 +80,7 @@ func.func @omp_target_allocmem_scalar_dynchar(%l : i32) -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar_kind(
 // CHECK-SAME: i32 [[TMP0:%.*]]) {
 // CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = mul i64 2, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
 // CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
@@ -141,7 +141,7 @@ func.func @omp_target_allocmem_array_of_dynchar(%l: i32) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 12
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 12, [[TMP0]]
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0)
 // CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
@@ -157,7 +157,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar(%e: index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar2(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 4
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 4, [[TMP0]]
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
@@ -174,7 +174,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar2(%e: index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 60
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 60, [[TMP0]]
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 1, [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0)
 // CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
@@ -191,7 +191,7 @@ func.func @omp_target_allocmem_dynarray_of_char(%e : index) -> () {
 
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char2(
 // CHECK-SAME: i64 [[TMP0:%.*]]) {
-// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 20
+// CHECK-NEXT:    [[TMP2:%.*]] = mul i64 20, [[TMP0]]
 // CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]]
 // CHECK-NEXT:    [[TMP4:%.*]] = mul i64 1, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0)
@@ -227,7 +227,7 @@ func.func @omp_target_allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> () {
 // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_dynchar2(
 // CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) {
 // CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+// CHECK-NEXT:    [[TMP4:%.*]] = mul i64 2, [[TMP3]]
 // CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP1]]
 // CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP1]]
 // CHECK-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]

>From 343f7ec16e14435a6b14b676cecf045d8501da56 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 11:09:15 -0500
Subject: [PATCH 10/27] Fix more testcases

---
 flang/test/Fir/boxproc.fir                       | 16 ++++++++++++----
 flang/test/Fir/embox.fir                         |  6 +++---
 .../Integration/OpenMP/map-types-and-sizes.f90   | 14 +++++++-------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index d4c36a4f5b213..97d9b38ed6f40 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -82,8 +82,12 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 // CHECK:         store [1 x i8] c" ", ptr %[[VAL_18]], align 1
 // CHECK:         call void @llvm.init.trampoline(ptr %[[VAL_20]], ptr @_QFtest_proc_dummy_charPgen_message, ptr %[[VAL_2]])
 // CHECK:         %[[VAL_23:.*]] = call ptr @llvm.adjust.trampoline(ptr %[[VAL_20]])
+// CHECK:         %[[VAL_25:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_23]], 0
+// CHECK:         %[[VAL_26:.*]] = insertvalue { ptr, i64 } %[[VAL_25]], i64 10, 1
 // CHECK:         %[[VAL_27:.*]] = call ptr @llvm.stacksave.p0()
-// CHECK:         %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_23]], i64 10)
+// CHECK:         %[[VAL_28:.*]] = extractvalue { ptr, i64 } %[[VAL_26]], 0
+// CHECK:         %[[VAL_29:.*]] = extractvalue { ptr, i64 } %[[VAL_26]], 1
+// CHECK:         %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_28]], i64 %[[VAL_29]])
 // CHECK:         %[[VAL_32:.*]] = call i1 @_FortranAioOutputAscii(ptr %{{.*}}, ptr %[[VAL_0]], i64 40)
 // CHECK:         call void @llvm.stackrestore.p0(ptr %[[VAL_27]])
 
@@ -111,10 +115,14 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 // CHECK-LABEL: define { ptr, i64 } @_QPget_message(ptr
 // CHECK-SAME:                  %[[VAL_0:.*]], i64 %[[VAL_1:.*]], ptr %[[VAL_2:.*]], i64
 // CHECK-SAME:                                                 %[[VAL_3:.*]])
+// CHECK:         %[[VAL_4:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_2]], 0
+// CHECK:         %[[VAL_5:.*]] = insertvalue { ptr, i64 } %[[VAL_4]], i64 %[[VAL_3]], 1
+// CHECK:         %[[VAL_7:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 0
+// CHECK:         %[[VAL_8:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 1
 // CHECK:         %[[VAL_9:.*]] = call ptr @llvm.stacksave.p0()
-// CHECK:         %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_3]], align 1
-// CHECK:         %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_2]](ptr %[[VAL_10]], i64 %[[VAL_3]])
-// CHECK:         %[[VAL_13:.*]] = add i64 %[[VAL_3]], 12
+// CHECK:         %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_8]], align 1
+// CHECK:         %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_7]](ptr %[[VAL_10]], i64 %[[VAL_8]])
+// CHECK:         %[[VAL_13:.*]] = add i64 %[[VAL_8]], 12
 // CHECK:         %[[VAL_14:.*]] = alloca i8, i64 %[[VAL_13]], align 1
 // CHECK:         call void @llvm.memmove.p0.p0.i64(ptr %[[VAL_14]], ptr {{.*}}, i64 12, i1 false)
 // CHECK:         %[[VAL_18:.*]] = phi i64
diff --git a/flang/test/Fir/embox.fir b/flang/test/Fir/embox.fir
index 11f7457b6873c..0f304cff2c79e 100644
--- a/flang/test/Fir/embox.fir
+++ b/flang/test/Fir/embox.fir
@@ -11,7 +11,7 @@ func.func @_QPtest_callee(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @_QPtest_slice() {
 // CHECK:  %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 // CHECK:  %[[a2:.*]] = alloca [20 x i32], i64 1, align 4
-// CHECK:  %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i32 0, i64 0
+// CHECK:  %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i64 0, i64 0
 // CHECK:  %[[a4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
 // CHECK:  { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK: [i64 1, i64 5, i64 8]] }, ptr %[[a3]], 0
@@ -38,7 +38,7 @@ func.func @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @_QPtest_dt_slice() {
 // CHECK:  %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 // CHECK:  %[[a3:.*]] = alloca [20 x %_QFtest_dt_sliceTt], i64 1, align 8
-// CHECK:  %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i32 0, i64 0, i32 0
+// CHECK:  %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i64 0, i64 0, i32 0
 // CHECK: %[[a5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
 // CHECK-SAME: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]]
 // CHECK-SAME: [i64 1, i64 5, i64 16
@@ -73,7 +73,7 @@ func.func @emboxSubstring(%arg0: !fir.ref<!fir.array<2x3x!fir.char<1,4>>>) {
   %0 = fir.shape %c2, %c3 : (index, index) -> !fir.shape<2>
   %1 = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 substr %c1_i64, %c2_i64 : (index, index, index, index, index, index, i64, i64) -> !fir.slice<2>
   %2 = fir.embox %arg0(%0) [%1] : (!fir.ref<!fir.array<2x3x!fir.char<1,4>>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
-  // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i32 0, i64 0, i64 0, i32 1
+  // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i64 0, i64 0, i64 0, i64 1
   // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 2, i32 20240719, i8 2, i8 40, i8 0, i8 0
   // CHECK-SAME: [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 2, i64 4], [3 x i64] [i64 1, i64 3, i64 8]] }
   // CHECK-SAME: ptr %[[addr]], 0
diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
index 5ce36ac87ca8c..665be5a8db4d4 100644
--- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90
+++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
@@ -545,7 +545,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCATABLE_DESC_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_type_allocaTone_layer, i64 1, align 8
 !CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_type_allocaTone_layer, ptr %[[ALLOCA]], i32 0, i32 4
-!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[DESC_BOUND_ACCESS_LOAD:.*]] = load i64, ptr %[[DESC_BOUND_ACCESS]], align 8
 !CHECK: %[[OFFSET_UB:.*]] = sub i64 %[[DESC_BOUND_ACCESS_LOAD]], 1
 !CHECK: %[[MEMBER_DESCRIPTOR_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[MEMBER_ACCESS]], i32 0, i32 0
@@ -596,7 +596,7 @@ end subroutine mapType_common_block_members
 !CHECK: %{{.*}} = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 0
 !CHECK: %{{.*}} = load ptr, ptr %{{.*}}, align 8
 !CHECK: %{{.*}} = getelementptr %_QFmaptype_alloca_derived_typeTone_layer, ptr %{{.*}}, i32 0, i32 4
-!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[LOAD_DESC_MEMBER_UB:.*]] = load i64, ptr %[[ACCESS_DESC_MEMBER_UB]], align 8
 !CHECK: %[[OFFSET_MEMBER_UB:.*]] = sub i64 %[[LOAD_DESC_MEMBER_UB]], 1
 !CHECK: %[[DTYPE_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0
@@ -665,7 +665,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
 !CHECK: %[[DTYPE_DESC_ALLOCA_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, align 8
 !CHECK: %[[DTYPE_DESC_ALLOCA_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1, align 8
-!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_ALLOCA_UB]], align 8
 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD]], 1
 !CHECK: %[[DTYPE_DESC_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0
@@ -734,7 +734,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_nested_derived_type_allocaTtop_layer, i64 1, align 8
 !CHECK: %[[NESTED_DTYPE_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTtop_layer, ptr %[[ALLOCA]], i32 0, i32 6
 !CHECK: %[[NESTED_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTmiddle_layer, ptr %[[NESTED_DTYPE_MEMBER_ACCESS]], i32 0, i32 2
-!CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[ALLOCATABLE_MEMBER_ADDR_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_BASE_ADDR]], align 8
 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ADDR_LOAD]], 1
 !CHECK: %[[NESTED_MEMBER_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 0
@@ -778,9 +778,9 @@ end subroutine mapType_common_block_members
 !CHECK: %[[ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, align 8
 !CHECK: %[[BASE_PTR_1:.*]] = alloca %_QFmaptype_nested_derived_type_member_idxTdtype, i64 1, align 8
 !CHECK: %[[OFF_PTR_1:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTdtype, ptr %[[BASE_PTR_1]], i32 0, i32 1
-!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[BOUNDS_LD:.*]] = load i64, ptr %[[BOUNDS_ACC]], align 8
-!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i32 0, i32 1
+!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i64 0, i32 1
 !CHECK: %[[BOUNDS_LD_2:.*]] = load i64, ptr %[[BOUNDS_ACC_2]], align 8
 !CHECK: %[[BOUNDS_CALC:.*]] = sub i64 %[[BOUNDS_LD_2]], 1
 !CHECK: %[[OFF_PTR_CALC_0:.*]] = sub i64 %[[BOUNDS_LD]], 1
@@ -789,7 +789,7 @@ end subroutine mapType_common_block_members
 !CHECK: %[[LOAD_DESC_PTR:.*]] = load ptr, ptr %[[GEP_DESC_PTR]], align 8
 !CHECK: %[[SZ_CALC_1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_0]], i32 0, i32 7, i32 0, i32 2
 !CHECK: %[[SZ_CALC_2:.*]] = load i64, ptr %[[SZ_CALC_1]], align 8
-!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 %[[SZ_CALC_2]], 1
+!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 1, %[[SZ_CALC_2]]
 !CHECK: %[[SZ_CALC_4:.*]] = add nsw i64 %[[SZ_CALC_3]], 0
 !CHECK: %[[SZ_CALC_5:.*]] = getelementptr i8, ptr %[[LOAD_DESC_PTR]], i64 %[[SZ_CALC_4]]
 !CHECK: %[[SZ_CALC_6:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTvertexes, ptr %[[SZ_CALC_5]], i32 0, i32 2

>From 41bb5b2f7e0678c1281b4ca1ee593ea16a0c4a38 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 14:59:14 -0500
Subject: [PATCH 11/27] Reset some tests back to their old states

---
 flang/test/Driver/tco-emit-final-mlir.fir      | 4 ++--
 flang/test/Fir/alloc.fir                       | 2 --
 flang/test/Fir/omp-reduction-embox-codegen.fir | 6 +++---
 flang/test/Fir/pdt.fir                         | 6 +++---
 flang/test/Fir/select.fir                      | 2 +-
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir
index 177810cf41378..75f8f153127af 100644
--- a/flang/test/Driver/tco-emit-final-mlir.fir
+++ b/flang/test/Driver/tco-emit-final-mlir.fir
@@ -13,7 +13,7 @@
 // CHECK: llvm.return
 // CHECK-NOT: func.func
 
-func.func @_QPfoo() -> !fir.ref<i32> {
+func.func @_QPfoo() {
   %1 = fir.alloca i32
-  return %1 : !fir.ref<i32>
+  return
 }
diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir
index 98d0a03790a58..9a0632d9e0172 100644
--- a/flang/test/Fir/alloc.fir
+++ b/flang/test/Fir/alloc.fir
@@ -366,13 +366,11 @@ func.func @allocmem_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir
 // CHECK:    %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
-func.func private @foo(%0: !fir.ref<!fir.class<none>>, %1: !fir.ref<!fir.class<!fir.array<?xnone>>>, %2: !fir.ref<!fir.box<none>>, %3: !fir.ref<!fir.box<!fir.array<?xnone>>>)
 func.func @alloca_unlimited_polymorphic_box() {
   %0 = fir.alloca !fir.class<none>
   %1 = fir.alloca !fir.class<!fir.array<?xnone>>
   %2 = fir.alloca !fir.box<none>
   %3 = fir.alloca !fir.box<!fir.array<?xnone>>
-  fir.call @foo(%0, %1, %2, %3) : (!fir.ref<!fir.class<none>>, !fir.ref<!fir.class<!fir.array<?xnone>>>, !fir.ref<!fir.box<none>>, !fir.ref<!fir.box<!fir.array<?xnone>>>) -> ()
   return
 }
 // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
index e517b1352ff5c..1645e1a407ad4 100644
--- a/flang/test/Fir/omp-reduction-embox-codegen.fir
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -23,14 +23,14 @@ omp.declare_reduction @test_reduction : !fir.ref<!fir.box<i32>> init {
   omp.yield(%0 : !fir.ref<!fir.box<i32>>)
 }
 
-func.func @_QQmain()  -> !fir.ref<!fir.box<i32>> attributes {fir.bindc_name = "reduce"} {
+func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
   %4 = fir.alloca !fir.box<i32>
   omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
     omp.terminator
   }
-  return %4: !fir.ref<!fir.box<i32>>
+  return
 }
 
 // basically we are testing that there isn't a crash
-// CHECK-LABEL: define ptr @_QQmain
+// CHECK-LABEL: define void @_QQmain
 // CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir
index 411927aae6bdf..a200cd7e7cc03 100644
--- a/flang/test/Fir/pdt.fir
+++ b/flang/test/Fir/pdt.fir
@@ -96,13 +96,13 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 {
 
 func.func private @bar(!fir.ref<!fir.char<1,?>>)
 
-// CHECK-LABEL: define ptr @_QPfoo(i32 %0, i32 %1)
-func.func @_QPfoo(%arg0 : i32, %arg1 : i32) -> !fir.ref<!fir.type<_QTt1>> {
+// CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1)
+func.func @_QPfoo(%arg0 : i32, %arg1 : i32) {
   // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1)
   // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]]
   %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32)
   //%2 = fir.coordinate_of %0, f2 : (!fir.ref<!fir.type<_QTt1>>) -> !fir.ref<!fir.char<1,?>>
   %2 = fir.zero_bits !fir.ref<!fir.char<1,?>>
   fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> ()
-  return %0 : !fir.ref<!fir.type<_QTt1>>
+  return
 }
diff --git a/flang/test/Fir/select.fir b/flang/test/Fir/select.fir
index 6d843e824d33f..5e88048446407 100644
--- a/flang/test/Fir/select.fir
+++ b/flang/test/Fir/select.fir
@@ -64,6 +64,6 @@ func.func @h(%a : i32) -> i32 {
    return %1 : i32
 ^bb6:
    %x = arith.addi %b4, %b3 : i32
-   // CHECK-DAG: ret i32
+   // CHECK: ret i32
    return %x : i32
 }

>From 3264bfeceb19a0be1eabce8f10e05a31c6aa1159 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 15:00:35 -0500
Subject: [PATCH 12/27] Undo empty line removal

---
 flang/test/Fir/alloc.fir | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir
index 9a0632d9e0172..8da8b828c18b9 100644
--- a/flang/test/Fir/alloc.fir
+++ b/flang/test/Fir/alloc.fir
@@ -366,6 +366,7 @@ func.func @allocmem_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir
 // CHECK:    %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1
 // CHECK:    %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
+
 func.func @alloca_unlimited_polymorphic_box() {
   %0 = fir.alloca !fir.class<none>
   %1 = fir.alloca !fir.class<!fir.array<?xnone>>

>From b9fb8ce003dfefe6d6b80556b532e50cf951dbe9 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 11 Sep 2025 15:15:02 -0500
Subject: [PATCH 13/27] Add back some CHECK statements in
 flang/test/Fir/target.fir and flang/test/Lower/allocatable-polymorphic.f90

---
 flang/test/Fir/target.fir                    | 4 ++++
 flang/test/Lower/allocatable-polymorphic.f90 | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/flang/test/Fir/target.fir b/flang/test/Fir/target.fir
index 1e721a09c835e..b04e23a018e7e 100644
--- a/flang/test/Fir/target.fir
+++ b/flang/test/Fir/target.fir
@@ -97,6 +97,10 @@ func.func @call8() {
 // X64-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3)
 // PPC-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3)
 func.func @char1lensum(%arg0 : !fir.boxchar<1>, %arg1 : !fir.boxchar<1>) -> i64 {
+  // X64-DAG: %[[p0:.*]] = insertvalue { ptr, i64 } undef, ptr %1, 0
+  // X64-DAG: = insertvalue { ptr, i64 } %[[p0]], i64 %3, 1
+  // X64-DAG: %[[p1:.*]] = insertvalue { ptr, i64 } undef, ptr %0, 0
+  // X64-DAG: = insertvalue { ptr, i64 } %[[p1]], i64 %2, 1
   %1:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1>>, i64)
   %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1>>, i64)
   // I32: %[[add:.*]] = add i64 %
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index 5a28e97054359..e6a8c5e025123 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -606,6 +606,8 @@ program test_alloc
 ! LLVM-COUNT-2:  call void %{{[0-9]*}}()
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
+! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7
+! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]]
 ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1
 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]]
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4
@@ -618,6 +620,8 @@ program test_alloc
 ! LLVM: call void %{{.*}}(ptr %{{.*}}) 
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
+! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7
+! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]]
 ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1
 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]]
 ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4

>From 7032caeca24b636e415352ed626461d292470b4d Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 17 Sep 2025 13:35:12 -0500
Subject: [PATCH 14/27] use the init region to initialize the heap allocated
 private variable

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 78 ++++++++++++++++++-
 1 file changed, 75 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index a61e57564657f..71128f2153ce5 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -14,9 +14,11 @@
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/Support/DebugLog.h"
+#include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <utility>
 
@@ -154,11 +156,43 @@ class PrepareForOMPOffloadPrivatizationPass
         rewriter.setInsertionPoint(chainOfOps.front());
         // Copy the value of the local variable into the heap-allocated
         // location.
-        Location loc = chainOfOps.front()->getLoc();
+        Operation *firstOp = chainOfOps.front();
+        Location loc = firstOp->getLoc();
         Type varType = getElemType(varPtr);
-        auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
-        (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
 
+
+        // // auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
+        // // (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
+        #if 0
+        Region &initRegion = privatizer.getInitRegion();
+        assert(!initRegion.empty() && "initRegion cannot be empty");
+        Block &entryBlock = initRegion.front();
+        Block *insertBlock = firstOp->getBlock();
+        Block *newBlock = insertBlock->splitBlock(firstOp);
+        Region *destRegion = firstOp->getParentRegion();
+        IRMapping irMap;
+        irMap.map(varPtr, entryBlock.getArgument(0));
+        irMap.map(heapMem, entryBlock.getArgument(1));
+
+        LDBG() << "Operation being walked before cloning the init region\n\n";
+        LLVM_DEBUG(llvm::dbgs() << getOperation() << "\n");
+        initRegion.cloneInto(destRegion, Region::iterator(newBlock), irMap);
+        LDBG() << "Operation being walked after cloning the init region\n";
+        LLVM_DEBUG(llvm::dbgs() << getOperation() << "\n");
+        //        rewriter.setInsertionPointToEnd(insertBlock);
+        // LLVM::BrOp::create(rewriter, loc,
+        //            , );
+#else
+        // Todo: Handle boxchar (by value)
+        Region &initRegion = privatizer.getInitRegion();
+        assert(!initRegion.empty() && "initRegion cannot be empty");
+        LLVM::LLVMFuncOp initFunc = createFuncOpForRegion(
+            loc, mod, initRegion,
+            llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
+            firstOp, rewriter);
+
+        rewriter.create<LLVM::CallOp>(loc, initFunc, ValueRange{varPtr, heapMem});
+#endif
         using ReplacementEntry = std::pair<Operation *, Operation *>;
         llvm::SmallVector<ReplacementEntry> replRecord;
         auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
@@ -412,5 +446,43 @@ class PrepareForOMPOffloadPrivatizationPass
     LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
     return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});
   }
+  LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod,
+                                         Region &srcRegion,
+                                         llvm::StringRef funcName,
+                                         Operation *insertPt,
+                                         IRRewriter &rewriter) {
+
+    OpBuilder::InsertionGuard guard(rewriter);
+    MLIRContext *ctx = mod.getContext();
+    rewriter.setInsertionPoint(mod.getBody(), mod.getBody()->end());
+    Region clonedRegion;
+    IRMapping mapper;
+    srcRegion.cloneInto(&clonedRegion, mapper);
+    SmallVector<Type> paramTypes = {srcRegion.getArgument(0).getType(),
+                                    srcRegion.getArgument(1).getType()};
+    LDBG() << "paramTypes are \n"
+           << srcRegion.getArgument(0).getType() << "\n"
+           << srcRegion.getArgument(1).getType() << "\n";
+    LLVM::LLVMFunctionType funcType =
+        LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(ctx), paramTypes);
+
+    LDBG() << "funcType is " << funcType << "\n";
+    LLVM::LLVMFuncOp func =
+        LLVM::LLVMFuncOp::create(rewriter, loc, funcName, funcType);
+    func.setAlwaysInline(true);
+    rewriter.inlineRegionBefore(clonedRegion, func.getRegion(),
+                                func.getRegion().end());
+    for (auto &block : func.getRegion().getBlocks()) {
+      if (isa<omp::YieldOp>(block.getTerminator())) {
+        omp::YieldOp yieldOp = cast<omp::YieldOp>(block.getTerminator());
+        rewriter.setInsertionPoint(yieldOp);
+        rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(yieldOp, TypeRange(),
+                                                    Value());
+      }
+    }
+    LDBG() << funcName << " is \n" << func << "\n";
+    LLVM_DEBUG(llvm::dbgs() << "Module is \n" << mod << "\n");
+    return func;
+  }
 };
 } // namespace

>From 269c575fc8a1a33fd1403d4f12973a5e6786d9b6 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 17 Sep 2025 14:54:56 -0500
Subject: [PATCH 15/27] use the copy region to copy the private variable into
 its heap allocated version

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 82 +++----------------
 1 file changed, 12 insertions(+), 70 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 71128f2153ce5..e0de0434f0587 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -202,6 +202,16 @@ class PrepareForOMPOffloadPrivatizationPass
           return clonedOp;
         };
 
+        if (isFirstPrivate) {
+          Region &copyRegion = privatizer.getCopyRegion();
+          assert(!copyRegion.empty() && "copyRegion cannot be empty");
+          LLVM::LLVMFuncOp copyFunc = createFuncOpForRegion(
+              loc, mod,  copyRegion,
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
+              firstOp, rewriter);
+          rewriter.create<LLVM::CallOp>(loc, copyFunc, ValueRange{varPtr, heapMem});
+        }
+
         rewriter.setInsertionPoint(targetOp);
         rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation));
 
@@ -218,76 +228,8 @@ class PrepareForOMPOffloadPrivatizationPass
             if (memberMapInfoOp.getVarPtrPtr()) {
               Operation *varPtrPtrdefOp =
                   memberMapInfoOp.getVarPtrPtr().getDefiningOp();
-
-              // In the case of firstprivate, we have to do the following
-              // 1. Allocate heap memory for the underlying data.
-              // 2. Copy the original underlying data to the new memory
-              // allocated on the heap.
-              // 3. Put this new (heap) address in the originating
-              // struct/descriptor
-
-              // Consider the following sequence of omp.map.info and omp.target
-              // operations.
-              // %0 = llvm.getelementptr %19[0, 0]
-              // %1 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) ...
-              //                   var_ptr_ptr(%0 : !llvm.ptr)  bounds(..)
-              // %2 = omp.map.info var_ptr(%19 : !llvm.ptr, !desc_type)>) ...
-              //                   members(%1 : [0] : !llvm.ptr) -> !llvm.ptr
-              // omp.target nowait map_entries(%2 -> %arg5, %1 -> %arg8 : ..)
-              //                   private(@privatizer %19 -> %arg9 [map_idx=1]
-              //                   : !llvm.ptr) {
-              // We need to allocate memory on the heap for the underlying
-              // pointer which is stored at the var_ptr_ptr operand of %1. Then
-              // we need to copy this pointer to the new heap allocated memory
-              // location. Then, we need to store the address of the new heap
-              // location in the originating struct/descriptor. So, we generate
-              // the following (pseudo) MLIR code (Using the same names of
-              // mlir::Value instances in the example as in the code below)
-              //
-              // %dataMalloc = malloc(totalSize)
-              // %loadDataPtr = load %0 : !llvm.ptr -> !llvm.ptr
-              // memcpy(%dataMalloc, %loadDataPtr, totalSize)
-              // %newVarPtrPtrOp = llvm.getelementptr %heapMem[0, 0]
-              // llvm.store %dataMalloc, %newVarPtrPtrOp
-              // %1.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, i32) ...
-              //                          var_ptr_ptr(%newVarPtrPtrOp :
-              //                          !llvm.ptr)
-              // %2.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr,
-              //                                             !desc_type)>) ...
-              //                          members(%1.cloned : [0] : !llvm.ptr)
-              //             -> !llvm.ptr
-              // omp.target nowait map_entries(%2.cloned -> %arg5,
-              //                               %1.cloned -> %arg8 : ..)
-              //            private(@privatizer %heapMem -> .. [map_idx=1] : ..)
-              //            {
-
-              if (isFirstPrivate) {
-                assert(!memberMapInfoOp.getBounds().empty() &&
-                       "empty bounds on member map of firstprivate variable");
-                Location loc = memberMapInfoOp.getLoc();
-                Value totalSize =
-                    getSizeInBytes(memberMapInfoOp, mod, rewriter);
-                auto dataMalloc =
-                    allocateHeapMem(loc, totalSize, mod, rewriter);
-                auto loadDataPtr = rewriter.create<LLVM::LoadOp>(
-                    loc, memberMapInfoOp.getVarPtrPtr().getType(),
-                    memberMapInfoOp.getVarPtrPtr());
-                (void)rewriter.create<LLVM::MemcpyOp>(
-                    loc, dataMalloc.getResult(), loadDataPtr.getResult(),
-                    totalSize, /*isVolatile=*/false);
-                Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp);
-                rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(),
-                                              newVarPtrPtrOp->getOpResult(0),
-                                              loadDataPtr);
-                rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() {
-                  newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem);
-                });
-                (void)rewriter.create<LLVM::StoreOp>(
-                    loc, dataMalloc.getResult(), newVarPtrPtrOp->getResult(0));
-              } else {
-                rewriter.setInsertionPoint(
-                    cloneAndMarkForDeletion(varPtrPtrdefOp));
-              }
+              rewriter.setInsertionPoint(
+                  cloneAndMarkForDeletion(varPtrPtrdefOp));
             }
           }
         }

>From 3109e4f52218e262d50c510028658b299ab05987 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 17 Sep 2025 23:30:21 -0500
Subject: [PATCH 16/27] Fix for boxchars is working

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 96 ++++++++++---------
 1 file changed, 53 insertions(+), 43 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index e0de0434f0587..0f7084a935a09 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -100,7 +100,8 @@ class PrepareForOMPOffloadPrivatizationPass
         // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols
         // should have mapped the pointer to the boxchar so use that as varPtr.
         Value varPtr = privVar;
-        if (!isa<LLVM::LLVMPointerType>(privVar.getType()))
+        bool isPrivatizedByValue = !isa<LLVM::LLVMPointerType>(privVar.getType());
+        if (isPrivatizedByValue)
           varPtr = mapInfoOp.getVarPtr();
 
         assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
@@ -119,7 +120,7 @@ class PrepareForOMPOffloadPrivatizationPass
         // location. We'll insert that load later after we have updated
         // the malloc'd location with the contents of the original
         // variable.
-        if (isa<LLVM::LLVMPointerType>(privVar.getType()))
+        if (!isPrivatizedByValue)
           newPrivVars.push_back(heapMem);
 
         // Find the earliest insertion point for the copy. This will be before
@@ -154,13 +155,12 @@ class PrepareForOMPOffloadPrivatizationPass
         });
 
         rewriter.setInsertionPoint(chainOfOps.front());
-        // Copy the value of the local variable into the heap-allocated
-        // location.
+
         Operation *firstOp = chainOfOps.front();
         Location loc = firstOp->getLoc();
         Type varType = getElemType(varPtr);
 
-
+        LDBG() << "varType = " << varType << "\n";
         // // auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
         // // (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
         #if 0
@@ -184,36 +184,56 @@ class PrepareForOMPOffloadPrivatizationPass
         //            , );
 #else
         // Todo: Handle boxchar (by value)
-        Region &initRegion = privatizer.getInitRegion();
-        assert(!initRegion.empty() && "initRegion cannot be empty");
-        LLVM::LLVMFuncOp initFunc = createFuncOpForRegion(
-            loc, mod, initRegion,
-            llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
-            firstOp, rewriter);
 
-        rewriter.create<LLVM::CallOp>(loc, initFunc, ValueRange{varPtr, heapMem});
+        // Create a llvm.func for 'region' that is marked always_inline and call it.
+        auto createAlwaysInlineFuncAndCallIt = [&](Region &region,
+                                                   llvm::StringRef funcName,
+                                                   Value mold,
+                                                   Value arg1) -> Value {
+          assert(!region.empty() && "region cannot be empty");
+          LLVM::LLVMFuncOp func = createFuncOpForRegion(
+              loc, mod, region,
+              funcName,
+              firstOp, rewriter);
+          auto call = rewriter.create<LLVM::CallOp>(loc, func, ValueRange{mold, arg1});
+          LDBG() << "inside createAlwaysInlineFuncAndCallIt\n";
+          return call.getResult();
+        };
+        Value moldArg, newArg;
+        if (isPrivatizedByValue) {
+          moldArg = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
+          newArg = rewriter.create<LLVM::LoadOp>(loc, varType, heapMem);
+        } else {
+          moldArg = varPtr;
+          newArg = heapMem;
+        }
+        Value initializedVal = createAlwaysInlineFuncAndCallIt(
+            privatizer.getInitRegion(),
+            llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
+            moldArg, newArg);
+        LDBG() << "initializedVal = " << initializedVal << "\n";
 #endif
-        using ReplacementEntry = std::pair<Operation *, Operation *>;
-        llvm::SmallVector<ReplacementEntry> replRecord;
-        auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * {
+        if (isFirstPrivate)
+          initializedVal = createAlwaysInlineFuncAndCallIt(
+              privatizer.getCopyRegion(),
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
+              moldArg, initializedVal);
+
+        if (isPrivatizedByValue)
+          (void)rewriter.create<LLVM::StoreOp>(loc, initializedVal, heapMem);
+
+        auto cloneModifyAndErase = [&](Operation *origOp) -> Operation * {
           Operation *clonedOp = rewriter.clone(*origOp);
           rewriter.replaceAllOpUsesWith(origOp, clonedOp);
-          replRecord.push_back(std::make_pair(origOp, clonedOp));
+          rewriter.modifyOpInPlace(clonedOp, [&]() {
+            clonedOp->replaceUsesOfWith(varPtr, heapMem);
+          });
+          rewriter.eraseOp(origOp);
           return clonedOp;
         };
 
-        if (isFirstPrivate) {
-          Region &copyRegion = privatizer.getCopyRegion();
-          assert(!copyRegion.empty() && "copyRegion cannot be empty");
-          LLVM::LLVMFuncOp copyFunc = createFuncOpForRegion(
-              loc, mod,  copyRegion,
-              llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
-              firstOp, rewriter);
-          rewriter.create<LLVM::CallOp>(loc, copyFunc, ValueRange{varPtr, heapMem});
-        }
-
         rewriter.setInsertionPoint(targetOp);
-        rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation));
+        rewriter.setInsertionPoint(cloneModifyAndErase(mapInfoOperation));
 
         // Fix any members that may use varPtr to now use heapMem
         if (!mapInfoOp.getMembers().empty()) {
@@ -221,34 +241,23 @@ class PrepareForOMPOffloadPrivatizationPass
             Operation *memberOperation = member.getDefiningOp();
             if (!usesVarPtr(memberOperation))
               continue;
-            rewriter.setInsertionPoint(
-                cloneAndMarkForDeletion(memberOperation));
+            rewriter.setInsertionPoint(cloneModifyAndErase(memberOperation));
 
             auto memberMapInfoOp = cast<omp::MapInfoOp>(memberOperation);
             if (memberMapInfoOp.getVarPtrPtr()) {
               Operation *varPtrPtrdefOp =
                   memberMapInfoOp.getVarPtrPtr().getDefiningOp();
-              rewriter.setInsertionPoint(
-                  cloneAndMarkForDeletion(varPtrPtrdefOp));
+              rewriter.setInsertionPoint(cloneModifyAndErase(varPtrPtrdefOp));
             }
           }
         }
 
-        for (auto repl : replRecord) {
-          Operation *origOp = repl.first;
-          Operation *clonedOp = repl.second;
-          rewriter.modifyOpInPlace(clonedOp, [&]() {
-            clonedOp->replaceUsesOfWith(varPtr, heapMem);
-          });
-          rewriter.eraseOp(origOp);
-        }
-
         // If the type of the private variable is not a pointer,
         // which is typically the case with !fir.boxchar types, then
         // we need to ensure that the new private variable is also
         // not a pointer. Insert a load from heapMem right before
         // targetOp.
-        if (!isa<LLVM::LLVMPointerType>(privVar.getType())) {
+        if (isPrivatizedByValue) {
           rewriter.setInsertionPoint(targetOp);
           auto newPrivVar = rewriter.create<LLVM::LoadOp>(mapInfoOp.getLoc(),
                                                           varType, heapMem);
@@ -402,11 +411,12 @@ class PrepareForOMPOffloadPrivatizationPass
     srcRegion.cloneInto(&clonedRegion, mapper);
     SmallVector<Type> paramTypes = {srcRegion.getArgument(0).getType(),
                                     srcRegion.getArgument(1).getType()};
+    Type resultType = srcRegion.getArgument(0).getType();
     LDBG() << "paramTypes are \n"
            << srcRegion.getArgument(0).getType() << "\n"
            << srcRegion.getArgument(1).getType() << "\n";
     LLVM::LLVMFunctionType funcType =
-        LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(ctx), paramTypes);
+        LLVM::LLVMFunctionType::get(resultType, paramTypes);
 
     LDBG() << "funcType is " << funcType << "\n";
     LLVM::LLVMFuncOp func =
@@ -419,7 +429,7 @@ class PrepareForOMPOffloadPrivatizationPass
         omp::YieldOp yieldOp = cast<omp::YieldOp>(block.getTerminator());
         rewriter.setInsertionPoint(yieldOp);
         rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(yieldOp, TypeRange(),
-                                                    Value());
+                                                    yieldOp.getResults().front());
       }
     }
     LDBG() << funcName << " is \n" << func << "\n";

>From d5b9c279abdf67ffccff2f7d8b5a5da4824a5a1f Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 18 Sep 2025 14:45:08 -0500
Subject: [PATCH 17/27] Adjust
 mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir after change
 to using the init and copy regions

---
 .../omp-offload-privatization-prepare.mlir    | 131 +++++-------------
 1 file changed, 38 insertions(+), 93 deletions(-)

diff --git a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
index 48b937be86a33..236cc6dfa6031 100644
--- a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
+++ b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
@@ -2,51 +2,21 @@
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
   llvm.func @free(!llvm.ptr)
-  omp.private {type = private} @privatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init {
+  llvm.func @malloc(i64) -> !llvm.ptr
+
+  omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init {
   ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
-    %0 = llvm.mlir.constant(48 : i32) : i32
-    "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %0 = llvm.mlir.constant(48 : i64) : i64
+    %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr
+    %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %1, %2 : !llvm.ptr, !llvm.ptr
     omp.yield(%arg1 : !llvm.ptr)
-  }
-
-  omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> copy {
+  } copy {
   ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
     %0 = llvm.mlir.constant(48 : i32) : i32
     "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
     omp.yield(%arg1 : !llvm.ptr)
   }
-
-  llvm.func internal @private_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
-    %0 = llvm.mlir.constant(1 : i32) : i32
-    %1 = llvm.mlir.constant(0 : index) : i64
-    %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-    %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
-    %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
-    %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-    llvm.store %0, %21 : i32, !llvm.ptr
-    %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
-    %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    %151 = llvm.load %150 : !llvm.ptr -> i64
-    %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    %153 = llvm.load %152 : !llvm.ptr -> i64
-    %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    %155 = llvm.load %154 : !llvm.ptr -> i64
-    %156 = llvm.sub %153, %1 : i64
-    %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true}
-    %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""}
-    %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr
-    omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %19 -> %arg9 [map_idx=1] : !llvm.ptr) {
-      omp.terminator
-    }
-    %166 = llvm.mlir.constant(48 : i32) : i32
-    %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-    %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr
-    llvm.call @free(%168) : (!llvm.ptr) -> ()
-    llvm.return
-  }
-
   llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
     %0 = llvm.mlir.constant(1 : i32) : i32
     %1 = llvm.mlir.constant(0 : index) : i64
@@ -79,10 +49,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
   }
 }
 
-// CHECK-LABEL:   llvm.func @malloc(i64) -> !llvm.ptr
-// CHECK:         llvm.func @free(!llvm.ptr)
+// CHECK-LABEL:       llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
 
-// CHECK-LABEL: llvm.func internal @private_test(
+// CHECK-LABEL: llvm.func internal @firstprivate_test(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "ptr0"},
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr {fir.bindc_name = "ptr1"}) {
 // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
 // CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
@@ -102,62 +74,35 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
 // CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_13]] : !llvm.ptr -> i64
 // CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_12]], %[[VAL_1]] : i64
 // CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_15]] : i64) extent(%[[VAL_12]] : i64) stride(%[[VAL_14]] : i64) start_idx(%[[VAL_10]] : i64) {stride_in_bytes = true}
-// CHECK: %[[VAL_17:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[VAL_17]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: %[[VAL_18:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_18]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
-// CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_19]] : [0] : !llvm.ptr) -> !llvm.ptr
-// CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_21:.*]], %[[VAL_20]] -> %[[VAL_22:.*]], %[[VAL_19]] -> %[[VAL_23:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %[[HEAP]] -> %[[VAL_24:.*]] [map_idx=1] : !llvm.ptr) {
+// CHECK: %[[VAL_17:.*]] = llvm.call @firstprivatizer_init(%[[STACK]], %[[HEAP]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_18:.*]] = llvm.call @firstprivatizer_copy(%[[STACK]], %[[VAL_17]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_19]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_20]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_22:.*]], %[[VAL_21]] -> %[[VAL_23:.*]], %[[VAL_20]] -> %[[VAL_24:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP]] -> %[[VAL_25:.*]] [map_idx=1] : !llvm.ptr) {
 // CHECK: omp.terminator
 // CHECK: }
-// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> !llvm.ptr
-// CHECK: llvm.call @free(%[[VAL_27]]) : (!llvm.ptr) -> ()
+// CHECK: %[[VAL_26:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_28:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_28]]) : (!llvm.ptr) -> ()
 // CHECK: llvm.return
 // CHECK: }
 
-// CHECK-LABEL: llvm.func internal @firstprivate_test(
-// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64
-// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
-// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
-// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
-// CHECK: %[[VAL_7:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[VAL_7]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: llvm.store %[[VAL_0]], %[[VAL_6]] : i32, !llvm.ptr
-// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
-// CHECK: %[[VAL_9:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_10:.*]] = llvm.load %[[VAL_9]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_11:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_12:.*]] = llvm.load %[[VAL_11]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_13:.*]] = llvm.getelementptr %[[STACK]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_13]] : !llvm.ptr -> i64
-// CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_12]], %[[VAL_1]] : i64
-// CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_15]] : i64) extent(%[[VAL_12]] : i64) stride(%[[VAL_14]] : i64) start_idx(%[[VAL_10]] : i64) {stride_in_bytes = true}
-// CHECK: %[[VAL_17:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[VAL_17]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
-// CHECK: %[[VAL_18:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK: %[[VAL_20:.*]] = llvm.sub %[[VAL_15]], %[[VAL_1]] : i64
-// CHECK: %[[VAL_21:.*]] = llvm.add %[[VAL_20]], %[[VAL_19]] : i64
-// CHECK: %[[VAL_22:.*]] = llvm.mul %[[VAL_19]], %[[VAL_21]] : i64
-// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(4 : i64) : i64
-// CHECK: %[[VAL_24:.*]] = llvm.mul %[[VAL_22]], %[[VAL_23]] : i64
-// CHECK: %[[NEW_DATA_PTR:.*]] = llvm.call @malloc(%[[VAL_24]]) : (i64) -> !llvm.ptr
-// CHECK: %[[OLD_DATA_PTR:.*]] = llvm.load %[[VAL_18]] : !llvm.ptr -> !llvm.ptr
-// CHECK: "llvm.intr.memcpy"(%[[NEW_DATA_PTR]], %[[OLD_DATA_PTR]], %[[VAL_24]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
-// CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: llvm.store %[[NEW_DATA_PTR]], %[[VAL_27]] : !llvm.ptr, !llvm.ptr
-// CHECK: %[[VAL_28:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_27]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
-// CHECK: %[[VAL_29:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_28]] : [0] : !llvm.ptr) -> !llvm.ptr
-// CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_30:.*]], %[[VAL_29]] -> %[[VAL_31:.*]], %[[VAL_28]] -> %[[VAL_32:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP]] -> %[[VAL_33:.*]] [map_idx=1] : !llvm.ptr) {
-// CHECK: omp.terminator
+// CHECK-LABEL: llvm.func @firstprivatizer_init(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
 // CHECK: }
-// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(48 : i32) : i32
-// CHECK: %[[VAL_35:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_36:.*]] = llvm.load %[[VAL_35]] : !llvm.ptr -> !llvm.ptr
-// CHECK: llvm.call @free(%[[VAL_36]]) : (!llvm.ptr) -> ()
-// CHECK: llvm.return
+
+// CHECK-LABEL: llvm.func @firstprivatizer_copy(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
 // CHECK: }

>From 8bd4359856d718c53fc4667fbe024ce86d29e3dc Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 18 Sep 2025 14:52:33 -0500
Subject: [PATCH 18/27] Address minor review comments

---
 mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td | 2 +-
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp               | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
index 1ba67caba05be..a768ad011c448 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
@@ -83,6 +83,6 @@ def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prep
       allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated
       variable and not the original variable.
     }];
-  let dependentDialects = ["LLVM::LLVMDialect", "mlir::omp::OpenMPDialect"];
+  let dependentDialects = ["LLVM::LLVMDialect"];
 }
 #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 60c5406bdd197..de714d8b740af 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -624,7 +624,6 @@ LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
   // We use the thread-pool this context is creating, and avoid
   // creating any thread when disabled.
   MLIRContext threadPoolCtx;
-
   if (threadPoolCtx.isMultithreadingEnabled())
     threadPool = &threadPoolCtx.getThreadPool();
 

>From a393ff18664ab2b278246e14a6b3e056fd51260b Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 24 Sep 2025 13:45:33 -0500
Subject: [PATCH 19/27] Handle the case where varPtr is a block argument; take
 the varType from the mapInfoOp

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 47 ++++++++++++-------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 0f7084a935a09..d604cc6ad120c 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -105,7 +105,7 @@ class PrepareForOMPOffloadPrivatizationPass
           varPtr = mapInfoOp.getVarPtr();
 
         assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
-        Value heapMem = allocateHeapMem(targetOp, varPtr, mod, rewriter);
+        Value heapMem = allocateHeapMem(targetOp, varPtr, mapInfoOp.getVarType(), mod, rewriter);
         if (!heapMem)
           targetOp.emitError(
               "Unable to allocate heap memory when trying to move "
@@ -129,8 +129,13 @@ class PrepareForOMPOffloadPrivatizationPass
         // instead.
         Operation *varPtrDefiningOp = varPtr.getDefiningOp();
         DenseSet<Operation *> users;
-        users.insert(varPtrDefiningOp->user_begin(),
-                     varPtrDefiningOp->user_end());
+        if (varPtrDefiningOp) {
+          users.insert(varPtrDefiningOp->user_begin(),
+                       varPtrDefiningOp->user_end());
+        } else {
+          auto blockArg = cast<BlockArgument>(varPtr);
+          users.insert(blockArg.user_begin(), blockArg.user_end());
+        }
 
         auto usesVarPtr = [&users](Operation *op) -> bool {
           return users.count(op);
@@ -149,6 +154,7 @@ class PrepareForOMPOffloadPrivatizationPass
               chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp());
           }
         }
+
         DominanceInfo dom;
         llvm::sort(chainOfOps, [&](Operation *l, Operation *r) {
           return dom.dominates(l, r);
@@ -158,9 +164,9 @@ class PrepareForOMPOffloadPrivatizationPass
 
         Operation *firstOp = chainOfOps.front();
         Location loc = firstOp->getLoc();
-        Type varType = getElemType(varPtr);
+        //        Type varType = getElemType(varPtr);
+        Type varType = mapInfoOp.getVarType();
 
-        LDBG() << "varType = " << varType << "\n";
         // // auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
         // // (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
         #if 0
@@ -183,8 +189,6 @@ class PrepareForOMPOffloadPrivatizationPass
         // LLVM::BrOp::create(rewriter, loc,
         //            , );
 #else
-        // Todo: Handle boxchar (by value)
-
         // Create a llvm.func for 'region' that is marked always_inline and call it.
         auto createAlwaysInlineFuncAndCallIt = [&](Region &region,
                                                    llvm::StringRef funcName,
@@ -196,7 +200,6 @@ class PrepareForOMPOffloadPrivatizationPass
               funcName,
               firstOp, rewriter);
           auto call = rewriter.create<LLVM::CallOp>(loc, func, ValueRange{mold, arg1});
-          LDBG() << "inside createAlwaysInlineFuncAndCallIt\n";
           return call.getResult();
         };
         Value moldArg, newArg;
@@ -207,13 +210,17 @@ class PrepareForOMPOffloadPrivatizationPass
           moldArg = varPtr;
           newArg = heapMem;
         }
-        Value initializedVal = createAlwaysInlineFuncAndCallIt(
+
+        Value initializedVal;
+        if (!privatizer.getInitRegion().empty())
+          initializedVal = createAlwaysInlineFuncAndCallIt(
             privatizer.getInitRegion(),
             llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
             moldArg, newArg);
-        LDBG() << "initializedVal = " << initializedVal << "\n";
+        else
+          initializedVal = newArg;
 #endif
-        if (isFirstPrivate)
+        if (isFirstPrivate && !privatizer.getCopyRegion().empty())
           initializedVal = createAlwaysInlineFuncAndCallIt(
               privatizer.getCopyRegion(),
               llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
@@ -370,20 +377,28 @@ class PrepareForOMPOffloadPrivatizationPass
   }
 
   template <typename OpTy>
-  Value allocateHeapMem(OpTy targetOp, Value privVar, ModuleOp mod,
+  Value allocateHeapMem(OpTy targetOp, Value privVar, Type varType, ModuleOp mod,
                         IRRewriter &rewriter) const {
+    OpBuilder::InsertionGuard guard(rewriter);
     Value varPtr = privVar;
     Operation *definingOp = varPtr.getDefiningOp();
-    OpBuilder::InsertionGuard guard(rewriter);
-    rewriter.setInsertionPoint(definingOp);
+    BlockArgument blockArg;
+    if (!definingOp) {
+      blockArg = mlir::dyn_cast<BlockArgument>(varPtr);
+      rewriter.setInsertionPointToStart(blockArg.getParentBlock());
+    } else {
+      rewriter.setInsertionPoint(definingOp);
+    }
+    Location loc = definingOp ? definingOp->getLoc() : blockArg.getLoc();
     LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
 
-    Location loc = definingOp->getLoc();
-    Type varType = getElemType(varPtr);
+    // Type varType = getElemType(varPtr);
     assert(mod.getDataLayoutSpec() &&
            "MLIR module with no datalayout spec not handled yet");
+
     const DataLayout &dl = DataLayout(mod);
     std::int64_t distance = getSizeInBytes(dl, varType);
+
     Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
         loc, mallocFn.getFunctionType().getParamType(0), distance);
 

>From 28ec66a8d990e4e4f904c9841c2645671baa9dbc Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 24 Sep 2025 14:27:04 -0500
Subject: [PATCH 20/27] clean up the pass a little bit

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 25 ++-----------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index d604cc6ad120c..4d43064398e90 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -167,28 +167,6 @@ class PrepareForOMPOffloadPrivatizationPass
         //        Type varType = getElemType(varPtr);
         Type varType = mapInfoOp.getVarType();
 
-        // // auto loadVal = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
-        // // (void)rewriter.create<LLVM::StoreOp>(loc, loadVal.getResult(), heapMem);
-        #if 0
-        Region &initRegion = privatizer.getInitRegion();
-        assert(!initRegion.empty() && "initRegion cannot be empty");
-        Block &entryBlock = initRegion.front();
-        Block *insertBlock = firstOp->getBlock();
-        Block *newBlock = insertBlock->splitBlock(firstOp);
-        Region *destRegion = firstOp->getParentRegion();
-        IRMapping irMap;
-        irMap.map(varPtr, entryBlock.getArgument(0));
-        irMap.map(heapMem, entryBlock.getArgument(1));
-
-        LDBG() << "Operation being walked before cloning the init region\n\n";
-        LLVM_DEBUG(llvm::dbgs() << getOperation() << "\n");
-        initRegion.cloneInto(destRegion, Region::iterator(newBlock), irMap);
-        LDBG() << "Operation being walked after cloning the init region\n";
-        LLVM_DEBUG(llvm::dbgs() << getOperation() << "\n");
-        //        rewriter.setInsertionPointToEnd(insertBlock);
-        // LLVM::BrOp::create(rewriter, loc,
-        //            , );
-#else
         // Create a llvm.func for 'region' that is marked always_inline and call it.
         auto createAlwaysInlineFuncAndCallIt = [&](Region &region,
                                                    llvm::StringRef funcName,
@@ -202,6 +180,7 @@ class PrepareForOMPOffloadPrivatizationPass
           auto call = rewriter.create<LLVM::CallOp>(loc, func, ValueRange{mold, arg1});
           return call.getResult();
         };
+
         Value moldArg, newArg;
         if (isPrivatizedByValue) {
           moldArg = rewriter.create<LLVM::LoadOp>(loc, varType, varPtr);
@@ -219,7 +198,7 @@ class PrepareForOMPOffloadPrivatizationPass
             moldArg, newArg);
         else
           initializedVal = newArg;
-#endif
+
         if (isFirstPrivate && !privatizer.getCopyRegion().empty())
           initializedVal = createAlwaysInlineFuncAndCallIt(
               privatizer.getCopyRegion(),

>From d95799dd5f85736511e0c930d979d2b2ad1bf8b7 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 24 Sep 2025 14:39:09 -0500
Subject: [PATCH 21/27] Do not include OpenMPOffloadPrivatizationPrepare.h in
 Pipelines.h; include it in Pipelines.cpp instead

---
 flang/include/flang/Optimizer/Passes/Pipelines.h | 1 -
 flang/lib/Optimizer/Passes/Pipelines.cpp         | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 4d4d30e69cdd7..fd8c43cc88a19 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -22,7 +22,6 @@
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
-#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6c9e0648fede8..184b72a2a29da 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -10,6 +10,7 @@
 /// common to flang and the test tools.
 
 #include "flang/Optimizer/Passes/Pipelines.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "llvm/Support/CommandLine.h"
 
 /// Force setting the no-alias attribute on fuction arguments when possible.

>From 08297195dd5935bd6c9c416bd8a0efd6cdcc3bcd Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 25 Sep 2025 12:18:32 -0500
Subject: [PATCH 22/27] Move PrepareForOMPOffloadPrivatizationPass from
 Transforms in LLVMIR dialect to Transforms in OpenMP Dialect

---
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  4 +--
 .../mlir/Dialect/LLVMIR/Transforms/Passes.td  | 12 ---------
 .../mlir/Dialect/OpenMP/CMakeLists.txt        |  2 ++
 .../Dialect/OpenMP/Transforms/CMakeLists.txt  |  5 ++++
 .../OpenMPOffloadPrivatizationPrepare.h       | 12 ++++-----
 .../mlir/Dialect/OpenMP/Transforms/Passes.h   | 26 +++++++++++++++++++
 .../mlir/Dialect/OpenMP/Transforms/Passes.td  | 26 +++++++++++++++++++
 .../Dialect/LLVMIR/Transforms/CMakeLists.txt  |  1 -
 mlir/lib/Dialect/OpenMP/CMakeLists.txt        |  2 ++
 .../Dialect/OpenMP/Transforms/CMakeLists.txt  | 14 ++++++++++
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 10 +++----
 mlir/lib/RegisterAllPasses.cpp                |  2 ++
 .../omp-offload-privatization-prepare.mlir    |  6 ++---
 13 files changed, 93 insertions(+), 29 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
 rename mlir/include/mlir/Dialect/{LLVMIR => OpenMP}/Transforms/OpenMPOffloadPrivatizationPrepare.h (64%)
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
 create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
 rename mlir/lib/Dialect/{LLVMIR => OpenMP}/Transforms/OpenMPOffloadPrivatizationPrepare.cpp (98%)
 rename mlir/test/Dialect/{LLVMIR => OpenMP}/omp-offload-privatization-prepare.mlir (96%)

diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 184b72a2a29da..ab9ec5d6d28f6 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -10,7 +10,7 @@
 /// common to flang and the test tools.
 
 #include "flang/Optimizer/Passes/Pipelines.h"
-#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+#include "mlir/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "llvm/Support/CommandLine.h"
 
 /// Force setting the no-alias attribute on fuction arguments when possible.
@@ -417,7 +417,7 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
   // context of deferred target tasks.
   addNestedPassConditionally<mlir::LLVM::LLVMFuncOp>(
       pm, disableFirToLlvmIr, [&]() {
-        return mlir::LLVM::createPrepareForOMPOffloadPrivatizationPass();
+        return mlir::omp::createPrepareForOMPOffloadPrivatizationPass();
       });
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
index a768ad011c448..961909d5c8d27 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
@@ -73,16 +73,4 @@ def DIScopeForLLVMFuncOpPass : Pass<"ensure-debug-info-scope-on-llvm-func", "::m
   ];
 }
 
-def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "::mlir::LLVM::LLVMFuncOp"> {
-    let summary = "Prepare OpenMP maps for privatization for deferred target tasks";
-    let description = [{
-      When generating LLVMIR for privatized variables in an OpenMP offloading directive (eg. omp::TargetOp)
-      that creates a deferred target task (when the nowait clause is used), we need to copy the privatized
-      variable out of the stack of the generating task and into the heap so that the deferred target task
-      can still access it. However, if such a privatized variable is also mapped, typically the case for
-      allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated
-      variable and not the original variable.
-    }];
-  let dependentDialects = ["LLVM::LLVMDialect"];
-}
 #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index a65c6b1d3c96b..39ecc7370c17f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(Transforms)
+
 set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend/OpenMP/OMP.td)
 mlir_tablegen(OmpCommon.td --gen-directive-decl --directives-dialect=OpenMP)
 add_public_tablegen_target(omp_common_td)
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..22f0d92ea4cbf
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name OpenMP)
+add_public_tablegen_target(MLIROpenMPPassIncGen)
+
+add_mlir_doc(Passes OpenMPPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.h
similarity index 64%
rename from mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
rename to mlir/include/mlir/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.h
index 86aad5c593025..a8b3f7c717f70 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.h
@@ -6,17 +6,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
-#define MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
 
 #include <memory>
 
 namespace mlir {
 class Pass;
-namespace LLVM {
+namespace omp {
 #define GEN_PASS_DECL_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
-} // namespace LLVM
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+} // namespace omp
 } // namespace mlir
 
-#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
new file mode 100644
index 0000000000000..37d2eb907470c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
@@ -0,0 +1,26 @@
+//===- Passes.h - OpenMP Pass Construction and Registration ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+
+namespace omp {
+
+/// Generate the declarations and registration code for the OpenMP passes.
+#define GEN_PASS_DECL
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+
+} // namespace omp
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
new file mode 100644
index 0000000000000..2d98f97ee197e
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
@@ -0,0 +1,26 @@
+//===-- Passes.td - OpenMP pass definition file ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "::mlir::LLVM::LLVMFuncOp"> {
+    let summary = "Prepare OpenMP maps for privatization for deferred target tasks";
+    let description = [{
+      When generating LLVMIR for privatized variables in an OpenMP offloading directive (eg. omp::TargetOp)
+      that creates a deferred target task (when the nowait clause is used), we need to copy the privatized
+      variable out of the stack of the generating task and into the heap so that the deferred target task
+      can still access it. However, if such a privatized variable is also mapped, typically the case for
+      allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated
+      variable and not the original variable.
+    }];
+  let dependentDialects = ["LLVM::LLVMDialect"];
+}
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
index 729f5191cd557..37a45d478a1fb 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
@@ -7,7 +7,6 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
   LegalizeForExport.cpp
   OptimizeForNVVM.cpp
   RequestCWrappers.cpp
-  OpenMPOffloadPrivatizationPrepare.cpp
 
   DEPENDS
   MLIRLLVMPassIncGen
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 57a6d3445c151..f3c02da458508 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(Transforms)
+
 add_mlir_dialect_library(MLIROpenMPDialect
   IR/OpenMPDialect.cpp
 
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..b9b8eda9ed51b
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_mlir_dialect_library(MLIROpenMPTransforms
+  OpenMPOffloadPrivatizationPrepare.cpp
+
+  DEPENDS
+  MLIROpenMPPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRFuncDialect
+  MLIRLLVMDialect
+  MLIROpenMPDialect
+  MLIRPass
+  MLIRTransforms
+  )
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
similarity index 98%
rename from mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
rename to mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 4d43064398e90..88b2a98692cf6 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h"
+#include "mlir/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
@@ -31,12 +31,12 @@
 #define DEBUG_TYPE "omp-prepare-for-offload-privatization"
 
 namespace mlir {
-namespace LLVM {
+namespace omp {
 
 #define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
 
-} // namespace LLVM
+} // namespace omp
 } // namespace mlir
 
 using namespace mlir;
@@ -47,7 +47,7 @@ namespace {
 //===----------------------------------------------------------------------===//
 
 class PrepareForOMPOffloadPrivatizationPass
-    : public LLVM::impl::PrepareForOMPOffloadPrivatizationPassBase<
+  : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase<
           PrepareForOMPOffloadPrivatizationPass> {
 
   void runOnOperation() override {
diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp
index c67b24226ae45..9400be43dba09 100644
--- a/mlir/lib/RegisterAllPasses.cpp
+++ b/mlir/lib/RegisterAllPasses.cpp
@@ -33,6 +33,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/NVGPU/Transforms/Passes.h"
 #include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Dialect/Quant/Transforms/Passes.h"
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 #include "mlir/Dialect/SPIRV/Transforms/Passes.h"
@@ -80,6 +81,7 @@ void mlir::registerAllPasses() {
   memref::registerMemRefPasses();
   shard::registerShardPasses();
   ml_program::registerMLProgramPasses();
+  omp::registerOpenMPPasses();
   quant::registerQuantPasses();
   registerSCFPasses();
   registerShapePasses();
diff --git a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
similarity index 96%
rename from mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
rename to mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
index 236cc6dfa6031..2fd8bf0eee9e9 100644
--- a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir
+++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
+// RUN: mlir-opt -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
   llvm.func @free(!llvm.ptr)
@@ -77,8 +77,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
 // CHECK: %[[VAL_17:.*]] = llvm.call @firstprivatizer_init(%[[STACK]], %[[HEAP]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
 // CHECK: %[[VAL_18:.*]] = llvm.call @firstprivatizer_copy(%[[STACK]], %[[VAL_17]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
 // CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
-// CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_19]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
-// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_20]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_19]] : !llvm.ptr) bounds(%[[VAL_16]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses({{.*}}always{{.*}}to{{.*}}) capture(ByRef) members(%[[VAL_20]] : [0] : !llvm.ptr) -> !llvm.ptr
 // CHECK: omp.target nowait map_entries(%[[VAL_8]] -> %[[VAL_22:.*]], %[[VAL_21]] -> %[[VAL_23:.*]], %[[VAL_20]] -> %[[VAL_24:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP]] -> %[[VAL_25:.*]] [map_idx=1] : !llvm.ptr) {
 // CHECK: omp.terminator
 // CHECK: }

>From 01d1f694a50eabb1cd7dd32a7e95133b5ac63a42 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 25 Sep 2025 15:01:52 -0500
Subject: [PATCH 23/27] Add a lit test for boxchars

---
 .../omp-offload-privatization-prepare.mlir    | 142 +++++++++++++++++-
 1 file changed, 141 insertions(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
index 2fd8bf0eee9e9..77b669f1df2ca 100644
--- a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
+++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
   llvm.func @free(!llvm.ptr)
@@ -17,6 +17,39 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
     "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
     omp.yield(%arg1 : !llvm.ptr)
   }
+  omp.private {type = firstprivate} @private_eye : i32 copy {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.load %arg0 : !llvm.ptr -> i32
+    llvm.store %0, %arg1 : i32, !llvm.ptr
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+  omp.private {type = firstprivate} @boxchar_firstprivate : !llvm.struct<(ptr, i64)> init {
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>):
+    %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    %8 = llvm.call @malloc(%1) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr
+    %9 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+    %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, i64)>
+    %11 = llvm.insertvalue %1, %10[1] : !llvm.struct<(ptr, i64)>
+    omp.yield(%11 : !llvm.struct<(ptr, i64)>)
+  } copy {
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>):
+    %3 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %4 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    %5 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr, i64)>
+    %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr, i64)>
+    %7 = llvm.icmp "slt" %6, %4 : i64
+    %8 = llvm.select %7, %6, %4 : i1, i64
+    "llvm.intr.memmove"(%5, %3, %8) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+    omp.yield(%arg1 : !llvm.struct<(ptr, i64)>)
+  } dealloc {
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>):
+    %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    llvm.call @free(%0) : (!llvm.ptr) -> ()
+    omp.yield
+  }
+
   llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
     %0 = llvm.mlir.constant(1 : i32) : i32
     %1 = llvm.mlir.constant(0 : index) : i64
@@ -47,6 +80,42 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
     llvm.call @free(%168) : (!llvm.ptr) -> ()
     llvm.return
   }
+
+  llvm.func @target_boxchar_(%arg0: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    %2 = llvm.mlir.constant(1 : i64) : i64
+    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr
+    %4 = llvm.mlir.constant(1 : index) : i64
+    %5 = llvm.mlir.constant(0 : index) : i64
+    %6 = llvm.mlir.constant(0 : i32) : i32
+    %7 = llvm.mlir.constant(1 : i64) : i64
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.load %arg0 : !llvm.ptr -> i32
+    %10 = llvm.icmp "sgt" %9, %6 : i32
+    %11 = llvm.select %10, %9, %6 : i1, i32
+    %12 = llvm.mlir.constant(1 : i64) : i64
+    %13 = llvm.sext %11 : i32 to i64
+    %14 = llvm.alloca %13 x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr
+    %15 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+    %16 = llvm.sext %11 : i32 to i64
+    %17 = llvm.insertvalue %14, %15[0] : !llvm.struct<(ptr, i64)>
+    %18 = llvm.insertvalue %16, %17[1] : !llvm.struct<(ptr, i64)>
+    llvm.store %18, %3 : !llvm.struct<(ptr, i64)>, !llvm.ptr
+    %19 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+    %20 = llvm.extractvalue %19[0] : !llvm.struct<(ptr, i64)>
+    %21 = llvm.extractvalue %19[1] : !llvm.struct<(ptr, i64)>
+    %22 = llvm.sub %21, %4 : i64
+    %23 = omp.map.bounds lower_bound(%5 : i64) upper_bound(%22 : i64) extent(%21 : i64) stride(%4 : i64) start_idx(%5 : i64) {stride_in_bytes = true}
+    %24 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)>
+    %25 = omp.map.info var_ptr(%3 : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%24 : !llvm.ptr) bounds(%23) -> !llvm.ptr
+    %26 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%25 : [0] : !llvm.ptr) -> !llvm.ptr
+    %27 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+    omp.target nowait map_entries(%26 -> %arg1, %27 -> %arg2, %25 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %18 -> %arg4 [map_idx=0], @private_eye %1 -> %arg5 [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) {
+      omp.terminator
+    }
+    llvm.return
+  }
 }
 
 // CHECK-LABEL:       llvm.func @free(!llvm.ptr)
@@ -89,6 +158,51 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
 // CHECK: llvm.return
 // CHECK: }
 
+// CHECK-LABEL:   llvm.func @target_boxchar_(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(16 : i64) : i64
+// CHECK: %[[VAL_4:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_2]] x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_11:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32
+// CHECK: %[[VAL_12:.*]] = llvm.icmp "sgt" %[[VAL_11]], %[[VAL_8]] : i32
+// CHECK: %[[VAL_13:.*]] = llvm.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : i1, i32
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_15:.*]] = llvm.sext %[[VAL_13]] : i32 to i64
+// CHECK: %[[VAL_16:.*]] = llvm.alloca %[[VAL_15]] x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_17:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_18:.*]] = llvm.sext %[[VAL_13]] : i32 to i64
+// CHECK: %[[VAL_19:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_17]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_19]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: llvm.store %[[VAL_20]], %[[VAL_5]] : !llvm.struct<(ptr, i64)>, !llvm.ptr
+// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_21]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_21]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_24:.*]] = llvm.sub %[[VAL_23]], %[[VAL_6]] : i64
+// CHECK: %[[VAL_25:.*]] = omp.map.bounds lower_bound(%[[VAL_7]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_23]] : i64) stride(%[[VAL_6]] : i64) start_idx(%[[VAL_7]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_4]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_28:.*]] = llvm.call @boxchar_firstprivate_init(%[[VAL_26]], %[[VAL_27]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_29:.*]] = llvm.call @boxchar_firstprivate_copy(%[[VAL_26]], %[[VAL_28]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)>
+// CHECK: llvm.store %[[VAL_29]], %[[VAL_4]] : !llvm.struct<(ptr, i64)>, !llvm.ptr
+// CHECK: %[[VAL_30:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+// CHECK: %[[VAL_31:.*]] = llvm.getelementptr %[[VAL_4]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_32:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_31]] : !llvm.ptr) bounds(%[[VAL_25]]) -> !llvm.ptr
+// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%[[VAL_32]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_34:.*]] = llvm.load %[[VAL_4]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: omp.target nowait map_entries(%[[VAL_33]] -> %[[VAL_35:.*]], %[[VAL_30]] -> %[[VAL_36:.*]], %[[VAL_32]] -> %[[VAL_37:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %[[VAL_34]] -> %[[VAL_38:.*]] [map_idx=0], @private_eye %[[VAL_1]] -> %[[VAL_39:.*]] [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) {
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: llvm.return
+// CHECK: }
+
 // CHECK-LABEL: llvm.func @firstprivatizer_init(
 // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
 // CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
@@ -106,3 +220,29 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vec
 // CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 // CHECK: llvm.return %[[ARG1]] : !llvm.ptr
 // CHECK: }
+
+
+// CHECK-LABEL:   llvm.func @boxchar_firstprivate_init(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_2:.*]] = llvm.call @malloc(%[[VAL_1]]) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_4:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_3]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_4]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: llvm.return %[[VAL_5]] : !llvm.struct<(ptr, i64)>
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @boxchar_firstprivate_copy(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[ARG1]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[ARG1]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_4:.*]] = llvm.icmp "slt" %[[VAL_3]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_5:.*]] = llvm.select %[[VAL_4]], %[[VAL_3]], %[[VAL_1]] : i1, i64
+// CHECK: "llvm.intr.memmove"(%[[VAL_2]], %[[VAL_0]], %[[VAL_5]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.struct<(ptr, i64)>
+// CHECK: }

>From d5955725ac6ab2e14d212eaa45a9146780445b4d Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 26 Sep 2025 12:18:03 -0500
Subject: [PATCH 24/27] Make createFuncForRegionAndCallIt take an arrayref as
 argument

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 39 ++++++++-----------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 88b2a98692cf6..1a6befbe057c2 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -167,17 +167,15 @@ class PrepareForOMPOffloadPrivatizationPass
         //        Type varType = getElemType(varPtr);
         Type varType = mapInfoOp.getVarType();
 
-        // Create a llvm.func for 'region' that is marked always_inline and call it.
-        auto createAlwaysInlineFuncAndCallIt = [&](Region &region,
-                                                   llvm::StringRef funcName,
-                                                   Value mold,
-                                                   Value arg1) -> Value {
+        // Create a llvm.func for 'region' that is marked always_inline and call
+        // it.
+        auto createAlwaysInlineFuncAndCallIt =
+            [&](Region &region, llvm::StringRef funcName,
+                llvm::ArrayRef<Value> args) -> Value {
           assert(!region.empty() && "region cannot be empty");
-          LLVM::LLVMFuncOp func = createFuncOpForRegion(
-              loc, mod, region,
-              funcName,
-              firstOp, rewriter);
-          auto call = rewriter.create<LLVM::CallOp>(loc, func, ValueRange{mold, arg1});
+          LLVM::LLVMFuncOp func =
+              createFuncOpForRegion(loc, mod, region, funcName, rewriter);
+          auto call = rewriter.create<LLVM::CallOp>(loc, func, args);
           return call.getResult();
         };
 
@@ -195,7 +193,7 @@ class PrepareForOMPOffloadPrivatizationPass
           initializedVal = createAlwaysInlineFuncAndCallIt(
             privatizer.getInitRegion(),
             llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
-            moldArg, newArg);
+            {moldArg, newArg});
         else
           initializedVal = newArg;
 
@@ -203,7 +201,7 @@ class PrepareForOMPOffloadPrivatizationPass
           initializedVal = createAlwaysInlineFuncAndCallIt(
               privatizer.getCopyRegion(),
               llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
-              moldArg, initializedVal);
+              {moldArg, initializedVal});
 
         if (isPrivatizedByValue)
           (void)rewriter.create<LLVM::StoreOp>(loc, initializedVal, heapMem);
@@ -394,21 +392,17 @@ class PrepareForOMPOffloadPrivatizationPass
   LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod,
                                          Region &srcRegion,
                                          llvm::StringRef funcName,
-                                         Operation *insertPt,
                                          IRRewriter &rewriter) {
 
     OpBuilder::InsertionGuard guard(rewriter);
-    MLIRContext *ctx = mod.getContext();
     rewriter.setInsertionPoint(mod.getBody(), mod.getBody()->end());
     Region clonedRegion;
     IRMapping mapper;
     srcRegion.cloneInto(&clonedRegion, mapper);
-    SmallVector<Type> paramTypes = {srcRegion.getArgument(0).getType(),
-                                    srcRegion.getArgument(1).getType()};
-    Type resultType = srcRegion.getArgument(0).getType();
-    LDBG() << "paramTypes are \n"
-           << srcRegion.getArgument(0).getType() << "\n"
-           << srcRegion.getArgument(1).getType() << "\n";
+
+    SmallVector<Type> paramTypes;
+    llvm::copy(srcRegion.getArgumentTypes(), std::back_inserter(paramTypes));
+    Type resultType =  srcRegion.getArgument(0).getType();
     LLVM::LLVMFunctionType funcType =
         LLVM::LLVMFunctionType::get(resultType, paramTypes);
 
@@ -422,8 +416,9 @@ class PrepareForOMPOffloadPrivatizationPass
       if (isa<omp::YieldOp>(block.getTerminator())) {
         omp::YieldOp yieldOp = cast<omp::YieldOp>(block.getTerminator());
         rewriter.setInsertionPoint(yieldOp);
-        rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(yieldOp, TypeRange(),
-                                                    yieldOp.getResults().front());
+        if (!isa<LLVM::LLVMVoidType>(resultType))
+          rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(yieldOp, TypeRange(),
+                                                      yieldOp.getOperands());
       }
     }
     LDBG() << funcName << " is \n" << func << "\n";

>From bf949fafbad384dfc871450ada88f62a59f64986 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 26 Sep 2025 12:53:47 -0500
Subject: [PATCH 25/27] clean up

---
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 50 ++++++-------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 1a6befbe057c2..8369358310673 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -98,14 +98,17 @@ class PrepareForOMPOffloadPrivatizationPass
         // Allocate heap memory that corresponds to the type of memory
         // pointed to by varPtr
         // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols
-        // should have mapped the pointer the boxchar so use that as varPtr.
+        // should have mapped the pointer to the boxchar so use that as varPtr.
         Value varPtr = privVar;
-        bool isPrivatizedByValue = !isa<LLVM::LLVMPointerType>(privVar.getType());
+        Type varType = mapInfoOp.getVarType();
+        bool isPrivatizedByValue =
+            !isa<LLVM::LLVMPointerType>(privVar.getType());
         if (isPrivatizedByValue)
           varPtr = mapInfoOp.getVarPtr();
 
         assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
-        Value heapMem = allocateHeapMem(targetOp, varPtr, mapInfoOp.getVarType(), mod, rewriter);
+        Value heapMem = allocateHeapMem(targetOp, varPtr,
+                                        varType, mod, rewriter);
         if (!heapMem)
           targetOp.emitError(
               "Unable to allocate heap memory when trying to move "
@@ -123,6 +126,8 @@ class PrepareForOMPOffloadPrivatizationPass
         if (!isPrivatizedByValue)
           newPrivVars.push_back(heapMem);
 
+        // We now need to copy the original private variable into the newly
+        // allocated location in the heap.
         // Find the earliest insertion point for the copy. This will be before
         // the first in the list of omp::MapInfoOp instances that use varPtr.
         // After the copy these omp::MapInfoOp instances will refer to heapMem
@@ -136,10 +141,10 @@ class PrepareForOMPOffloadPrivatizationPass
           auto blockArg = cast<BlockArgument>(varPtr);
           users.insert(blockArg.user_begin(), blockArg.user_end());
         }
-
         auto usesVarPtr = [&users](Operation *op) -> bool {
           return users.count(op);
         };
+
         SmallVector<Operation *> chainOfOps;
         chainOfOps.push_back(mapInfoOperation);
         if (!mapInfoOp.getMembers().empty()) {
@@ -164,8 +169,6 @@ class PrepareForOMPOffloadPrivatizationPass
 
         Operation *firstOp = chainOfOps.front();
         Location loc = firstOp->getLoc();
-        //        Type varType = getElemType(varPtr);
-        Type varType = mapInfoOp.getVarType();
 
         // Create a llvm.func for 'region' that is marked always_inline and call
         // it.
@@ -206,6 +209,8 @@ class PrepareForOMPOffloadPrivatizationPass
         if (isPrivatizedByValue)
           (void)rewriter.create<LLVM::StoreOp>(loc, initializedVal, heapMem);
 
+        // clone origOp, replace all uses of varPtr with heapMem and
+        // erase origOp.
         auto cloneModifyAndErase = [&](Operation *origOp) -> Operation * {
           Operation *clonedOp = rewriter.clone(*origOp);
           rewriter.replaceAllOpUsesWith(origOp, clonedOp);
@@ -216,6 +221,9 @@ class PrepareForOMPOffloadPrivatizationPass
           return clonedOp;
         };
 
+        // Now that we have set up the heap-allocated copy of the private
+        // variable, rewrite all the uses of the original variable with
+        // the heap-allocated variable.
         rewriter.setInsertionPoint(targetOp);
         rewriter.setInsertionPoint(cloneModifyAndErase(mapInfoOperation));
 
@@ -280,25 +288,6 @@ class PrepareForOMPOffloadPrivatizationPass
     return privatizer;
   }
 
-  template <typename OpType>
-  Type getElemType(OpType op) const {
-    return op.getElemType();
-  }
-
-  Type getElemType(Value varPtr) const {
-    Operation *definingOp = unwrapAddrSpaceCast(varPtr.getDefiningOp());
-    assert((isa<LLVM::AllocaOp, LLVM::GEPOp>(definingOp)) &&
-           "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only "
-           "with Alloca or GEP for now");
-    if (auto allocaOp = dyn_cast<LLVM::AllocaOp>(definingOp))
-      return getElemType(allocaOp);
-    // TODO: get rid of this because GEPOp.getElemType() is not the right thing
-    // to use.
-    if (auto gepOp = dyn_cast<LLVM::GEPOp>(definingOp))
-      return getElemType(gepOp);
-    return Type{};
-  }
-
   Operation *unwrapAddrSpaceCast(Operation *op) const {
     if (!isa<LLVM::AddrSpaceCastOp>(op))
       return op;
@@ -353,9 +342,8 @@ class PrepareForOMPOffloadPrivatizationPass
     return mallocCall.value();
   }
 
-  template <typename OpTy>
-  Value allocateHeapMem(OpTy targetOp, Value privVar, Type varType, ModuleOp mod,
-                        IRRewriter &rewriter) const {
+  Value allocateHeapMem(omp::TargetOp targetOp, Value privVar, Type varType,
+                        ModuleOp mod, IRRewriter &rewriter) const {
     OpBuilder::InsertionGuard guard(rewriter);
     Value varPtr = privVar;
     Operation *definingOp = varPtr.getDefiningOp();
@@ -369,7 +357,6 @@ class PrepareForOMPOffloadPrivatizationPass
     Location loc = definingOp ? definingOp->getLoc() : blockArg.getLoc();
     LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
 
-    // Type varType = getElemType(varPtr);
     assert(mod.getDataLayoutSpec() &&
            "MLIR module with no datalayout spec not handled yet");
 
@@ -384,11 +371,6 @@ class PrepareForOMPOffloadPrivatizationPass
     return mallocCallOp.getResult();
   }
 
-  LLVM::CallOp allocateHeapMem(Location loc, Value size,
-                               ModuleOp mod, IRRewriter &rewriter) const {
-    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
-    return rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{size});
-  }
   LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod,
                                          Region &srcRegion,
                                          llvm::StringRef funcName,

>From 72a769fff63ac24e5541cfb661904ca1c52a6f6e Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 26 Sep 2025 15:57:02 -0500
Subject: [PATCH 26/27] make clang-format happy

---
 .../mlir/Dialect/OpenMP/Transforms/Passes.h   |  2 +-
 .../OpenMPOffloadPrivatizationPrepare.cpp     | 33 +++++++------------
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
index 37d2eb907470c..cb83d43e700dc 100644
--- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
@@ -20,7 +20,7 @@ namespace omp {
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
 
-} // namespace LLVM
+} // namespace omp
 } // namespace mlir
 
 #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES_H
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index 8369358310673..a35acf8dc3e6b 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -47,7 +47,7 @@ namespace {
 //===----------------------------------------------------------------------===//
 
 class PrepareForOMPOffloadPrivatizationPass
-  : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase<
+    : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase<
           PrepareForOMPOffloadPrivatizationPass> {
 
   void runOnOperation() override {
@@ -85,8 +85,7 @@ class PrepareForOMPOffloadPrivatizationPass
         bool isFirstPrivate = privatizer.getDataSharingType() ==
                               omp::DataSharingClauseType::FirstPrivate;
 
-        Value mappedValue =
-            targetOp.getMappedValueForPrivateVar(privVarIdx);
+        Value mappedValue = targetOp.getMappedValueForPrivateVar(privVarIdx);
         Operation *mapInfoOperation = mappedValue.getDefiningOp();
         auto mapInfoOp = cast<omp::MapInfoOp>(mapInfoOperation);
 
@@ -107,8 +106,8 @@ class PrepareForOMPOffloadPrivatizationPass
           varPtr = mapInfoOp.getVarPtr();
 
         assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
-        Value heapMem = allocateHeapMem(targetOp, varPtr,
-                                        varType, mod, rewriter);
+        Value heapMem =
+            allocateHeapMem(targetOp, varPtr, varType, mod, rewriter);
         if (!heapMem)
           targetOp.emitError(
               "Unable to allocate heap memory when trying to move "
@@ -194,9 +193,9 @@ class PrepareForOMPOffloadPrivatizationPass
         Value initializedVal;
         if (!privatizer.getInitRegion().empty())
           initializedVal = createAlwaysInlineFuncAndCallIt(
-            privatizer.getInitRegion(),
-            llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
-            {moldArg, newArg});
+              privatizer.getInitRegion(),
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
+              {moldArg, newArg});
         else
           initializedVal = newArg;
 
@@ -288,18 +287,9 @@ class PrepareForOMPOffloadPrivatizationPass
     return privatizer;
   }
 
-  Operation *unwrapAddrSpaceCast(Operation *op) const {
-    if (!isa<LLVM::AddrSpaceCastOp>(op))
-      return op;
-    LLVM::AddrSpaceCastOp addrSpaceCastOp =
-        cast<LLVM::AddrSpaceCastOp>(op);
-    return unwrapAddrSpaceCast(addrSpaceCastOp.getArg().getDefiningOp());
-  }
-
   // Get the (compile-time constant) size of varType as per the
   // given DataLayout dl.
-  std::int64_t getSizeInBytes(const DataLayout &dl,
-                              Type varType) const {
+  std::int64_t getSizeInBytes(const DataLayout &dl, Type varType) const {
     llvm::TypeSize size = dl.getTypeSize(varType);
     unsigned short alignment = dl.getTypeABIAlignment(varType);
     return llvm::alignTo(size, alignment);
@@ -308,11 +298,10 @@ class PrepareForOMPOffloadPrivatizationPass
   // Generate code to get the size of data being mapped from the bounds
   // of mapInfoOp
   Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod,
-                             IRRewriter &rewriter) const {
+                       IRRewriter &rewriter) const {
     Location loc = mapInfoOp.getLoc();
     Type llvmInt64Ty = rewriter.getI64Type();
-    Value constOne =
-        rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, 1);
+    Value constOne = rewriter.create<LLVM::ConstantOp>(loc, llvmInt64Ty, 1);
     Value elementCount = constOne;
     // TODO: Consider using  boundsOp.getExtent() if available.
     for (auto bounds : mapInfoOp.getBounds()) {
@@ -384,7 +373,7 @@ class PrepareForOMPOffloadPrivatizationPass
 
     SmallVector<Type> paramTypes;
     llvm::copy(srcRegion.getArgumentTypes(), std::back_inserter(paramTypes));
-    Type resultType =  srcRegion.getArgument(0).getType();
+    Type resultType = srcRegion.getArgument(0).getType();
     LLVM::LLVMFunctionType funcType =
         LLVM::LLVMFunctionType::get(resultType, paramTypes);
 

>From 061669f6f558338e872ac04542e1392a9c52f5a7 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 29 Sep 2025 10:50:42 -0500
Subject: [PATCH 27/27] Add some more comments and fix a typo

---
 .../Transforms/OpenMPOffloadPrivatizationPrepare.cpp     | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
index a35acf8dc3e6b..d400420688f77 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -119,7 +119,7 @@ class PrepareForOMPOffloadPrivatizationPass
         // simply record the newly allocated malloc location as the
         // new private variable. If, however, the type is not a pointer
         // then, we need to load the value from the newly allocated
-        // location. We'll inser that load later after we have updated
+        // location. We'll insert that load later after we have updated
         // the malloc'd location with the contents of the original
         // variable.
         if (!isPrivatizedByValue)
@@ -360,6 +360,10 @@ class PrepareForOMPOffloadPrivatizationPass
     return mallocCallOp.getResult();
   }
 
+  // Create a function for srcRegion and mark it always_inline.
+  // The big assumption here is that srcRegion is either the init or the copy
+  // region of an omp::PrivateClauseOp. Accordingly, the return type is assumed
+  // to be the same as the types of the two arguments of the region itself.
   LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod,
                                          Region &srcRegion,
                                          llvm::StringRef funcName,
@@ -377,7 +381,6 @@ class PrepareForOMPOffloadPrivatizationPass
     LLVM::LLVMFunctionType funcType =
         LLVM::LLVMFunctionType::get(resultType, paramTypes);
 
-    LDBG() << "funcType is " << funcType << "\n";
     LLVM::LLVMFuncOp func =
         LLVM::LLVMFuncOp::create(rewriter, loc, funcName, funcType);
     func.setAlwaysInline(true);
@@ -392,8 +395,6 @@ class PrepareForOMPOffloadPrivatizationPass
                                                       yieldOp.getOperands());
       }
     }
-    LDBG() << funcName << " is \n" << func << "\n";
-    LLVM_DEBUG(llvm::dbgs() << "Module is \n" << mod << "\n");
     return func;
   }
 };



More information about the Mlir-commits mailing list