[llvm-branch-commits] [flang] [mlir] [Flang][OpenMP] Add pass to replace allocas with device shared memory (PR #161863)

Mon Apr 27 04:46:47 PDT 2026

https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/161863

>From 0083db0ba72e11724f5dca91134c70802bb064c6 Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof at amd.com>
Date: Tue, 16 Sep 2025 13:45:43 +0100
Subject: [PATCH] [Flang][OpenMP] Add pass to replace allocas with device
 shared memory

This patch introduces a new OpenMP MLIR pass, only for target device modules,
that identifies `llvm.alloca` operations that should use device shared
memory and replaces them with pairs of `omp.alloc_shared_mem` and
`omp.free_shared_mem` operations.

This works in conjunction with the MLIR to LLVM IR translation pass' handling
of privatization, mapping and reductions in the OpenMP dialect to properly
select the right memory space for allocations based on where they are made and
where they are used.

This pass, in particular, handles explicit stack allocations in MLIR, whereas
the aforementioned translation pass takes care of implicit ones represented by
entry block arguments.
---
 .../include/flang/Optimizer/Support/InitFIR.h |   2 +
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   7 +
 flang/test/Fir/basic-program.fir              |   2 +
 mlir/docs/Passes.md                           |   4 +
 .../mlir/Dialect/OpenMP/Transforms/Passes.h   |   6 +-
 .../mlir/Dialect/OpenMP/Transforms/Passes.td  |  18 ++
 mlir/lib/Dialect/OpenMP/CMakeLists.txt        |  23 +-
 mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt     |  21 ++
 .../Dialect/OpenMP/Transforms/CMakeLists.txt  |  10 +
 .../OpenMP/Transforms/StackToShared.cpp       | 196 ++++++++++++++++++
 mlir/test/Dialect/OpenMP/stack-to-shared.mlir | 149 +++++++++++++
 11 files changed, 415 insertions(+), 23 deletions(-)
 create mode 100644 mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt
 create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp
 create mode 100644 mlir/test/Dialect/OpenMP/stack-to-shared.mlir

diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index d77d82feddd84..6051dbb07fad7 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -34,6 +34,7 @@
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 #include "mlir/InitAllDialects.h"
@@ -106,6 +107,7 @@ inline void loadDialects(mlir::MLIRContext &context) {
 /// but is a smaller set since we aren't using many of the passes found there.
 inline void registerMLIRPassesForFortranTools() {
   mlir::acc::registerOpenACCPasses();
+  mlir::omp::registerOpenMPPasses();
   mlir::registerCanonicalizerPass();
   mlir::registerCSEPass();
   mlir::affine::registerAffineLoopFusionPass();
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index bc95fcba512a4..920d6f86a355e 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -13,6 +13,7 @@
 #include "flang/Optimizer/OpenACC/Passes.h"
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "llvm/Support/CommandLine.h"
 
 /// Force setting the no-alias attribute on fuction arguments when possible.
@@ -440,6 +441,12 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
   }
 
   fir::addFIRToLLVMPass(pm, config);
+
+  // Convert applicable OpenMP stack allocations to shared memory allocations
+  // for GPU targets. This pass must run after any alloca-generating passes to
+  // ensure all are adequately accounted for.
+  if (config.EnableOpenMP && !config.EnableOpenMPSimd)
+    pm.addPass(mlir::omp::createStackToSharedPass());
 }
 
 /// Create a pass pipeline for lowering from MLIR to LLVM IR
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 5f84395b36037..1e26b388267b6 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -178,5 +178,7 @@ func.func @_QQmain() {
 // PASSES-NEXT:  LowerNontemporalPass
 // PASSES-NEXT: FIRToLLVMLowering
 // PASSES-NEXT: ReconcileUnrealizedCasts
+// PASSES-NEXT: 'llvm.func' Pipeline
+// PASSES-NEXT:  StackToSharedPass
 // PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass
 // PASSES-NEXT: LLVMIRLoweringPass
diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md
index bf9552100efe9..0226add6d0838 100644
--- a/mlir/docs/Passes.md
+++ b/mlir/docs/Passes.md
@@ -76,6 +76,10 @@ This document describes the available MLIR passes and their contracts.
 
 [include "MemRefPasses.md"]
 
+## 'omp' Dialect Passes
+
+[include "OpenMPPasses.md"]
+
 ## 'shard' Dialect Passes
 
 [include "ShardPasses.md"]
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
index 21b6d1f466558..ddbe662be69fc 100644
--- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
@@ -13,6 +13,10 @@
 
 namespace mlir {
 
+namespace LLVM {
+class LLVMFuncOp;
+} // namespace LLVM
+
 namespace omp {
 
 /// Generate the code for registering conversion passes.
@@ -23,4 +27,4 @@ namespace omp {
 } // namespace omp
 } // namespace mlir
 
-#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES_H
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
index 43d84b7fa4bf5..e6321ef58b45f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
@@ -34,4 +34,22 @@ def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prep
     }];
   let dependentDialects = ["LLVM::LLVMDialect"];
 }
+
+def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> {
+  let summary = "Replaces stack allocations on target devices with shared memory.";
+  let description = [{
+    This pass replaces `llvm.alloca` operations located in a non-SPMD target
+    region and then potentially used inside of an `omp.parallel` region with
+    `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also done for
+    top-level function `llvm.alloca`s used in the same way when the parent
+    function is a target device function.
+
+    This ensures that explicit private allocations, intended to be shared across
+    threads, use the proper memory space on a target device while supporting the
+    case of parallel regions indirectly reached from within a target region via
+    function calls.
+  }];
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 #endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 41c68b4244590..9f57627c321fb 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -1,23 +1,2 @@
+add_subdirectory(IR)
 add_subdirectory(Transforms)
-
-add_mlir_dialect_library(MLIROpenMPDialect
-  IR/OpenMPDialect.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
-
-  DEPENDS
-  omp_gen
-  MLIROpenMPOpsIncGen
-  MLIROpenMPOpsInterfacesIncGen
-  MLIROpenMPTypeInterfacesIncGen
-
-  LINK_COMPONENTS
-  TargetParser
-
-  LINK_LIBS PUBLIC
-  MLIRIR
-  MLIRLLVMDialect
-  MLIRFuncDialect
-  MLIROpenACCMPCommon
-  )
diff --git a/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt
new file mode 100644
index 0000000000000..1beea2098d3bb
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_mlir_dialect_library(MLIROpenMPDialect
+  OpenMPDialect.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  DEPENDS
+  omp_gen
+  MLIROpenMPOpsIncGen
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
+
+  LINK_COMPONENTS
+  TargetParser
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRLLVMDialect
+  MLIRFuncDialect
+  MLIROpenACCMPCommon
+  )
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
index 9b11d4b87e8df..e989cb2945f31 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -1,15 +1,25 @@
 add_mlir_dialect_library(MLIROpenMPTransforms
   MarkDeclareTarget.cpp
   OpenMPOffloadPrivatizationPrepare.cpp
+  StackToShared.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
 
   DEPENDS
+  omp_gen
   MLIROpenMPPassIncGen
+  MLIROpenMPOpsIncGen
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
 
   LINK_LIBS PUBLIC
   MLIRFunctionInterfaces
   MLIRIR
   MLIRLLVMDialect
+  MLIROpenACCMPCommon
   MLIROpenMPDialect
   MLIRPass
+  MLIRSupport
   MLIRTransforms
   )
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp
new file mode 100644
index 0000000000000..0edccf53a2031
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp
@@ -0,0 +1,196 @@
+//===- StackToShared.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements transforms to swap stack allocations on the target
+// device with device shared memory where applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
+
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace omp {
+#define GEN_PASS_DEF_STACKTOSHAREDPASS
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+} // namespace omp
+} // namespace mlir
+
+using namespace mlir;
+
+/// Returns true if `use` forces its allocation into device shared memory: it
+/// is a reduction variable of an omp.parallel, a call argument, or any use by
+/// an operation nested in an omp.parallel other than as a private variable
+/// whose privatizer's data-sharing type is `private`.
+static bool allocaUseRequiresDeviceSharedMem(const OpOperand &use) {
+  Operation *owner = use.getOwner();
+  if (auto parallelOp = dyn_cast<omp::ParallelOp>(owner)) {
+    if (llvm::is_contained(parallelOp.getReductionVars(), use.get()))
+      return true;
+  } else if (auto callOp = dyn_cast<CallOpInterface>(owner)) {
+    if (llvm::is_contained(callOp.getArgOperands(), use.get()))
+      return true;
+  }
+
+  // If the user is nested inside of a parallel region, the allocation has to
+  // be replaced unless it is privatized there by a `private`-type privatizer.
+  if (owner->getParentOfType<omp::ParallelOp>()) {
+    if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(owner)) {
+      if (auto privateSyms =
+              cast_or_null<ArrayAttr>(owner->getAttr("private_syms"))) {
+        for (auto [var, sym] :
+             llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
+          if (var != use.get())
+            continue;
+
+          auto moduleOp = owner->getParentOfType<ModuleOp>();
+          auto privateOp = cast<omp::PrivateClauseOp>(
+              moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+          return privateOp.getDataSharingType() !=
+                 omp::DataSharingClauseType::Private;
+        }
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+static bool shouldReplaceAllocaWithUses(const Operation::use_range &uses) {
+  // Check direct uses and also follow addrspacecast/getelementptr results.
+  for (const OpOperand &use : uses) {
+    Operation *owner = use.getOwner();
+    if (llvm::isa<LLVM::AddrSpaceCastOp, LLVM::GEPOp>(owner)) {
+      if (shouldReplaceAllocaWithUses(owner->getUses()))
+        return true;
+    } else if (allocaUseRequiresDeviceSharedMem(use)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// TODO: Refactor the logic in `shouldReplaceAllocaWithDeviceSharedMem`,
+// `shouldReplaceAllocaWithUses` and `allocaUseRequiresDeviceSharedMem` to
+// be reusable by the MLIR to LLVM IR translation stage, as something very
+// similar is also implemented there to choose between allocas and device
+// shared memory allocations when processing OpenMP reductions, mapping and
+// privatization.
+static bool shouldReplaceAllocaWithDeviceSharedMem(Operation &op) {
+  auto offloadIface = op.getParentOfType<omp::OffloadModuleInterface>();
+  if (!offloadIface || !offloadIface.getIsTargetDevice())
+    return false;
+
+  auto targetOp = op.getParentOfType<omp::TargetOp>();
+
+  // It must be inside of a generic omp.target or in a target device function,
+  // and not inside of an omp.parallel nested within that target or function.
+  if (auto parallelOp = op.getParentOfType<omp::ParallelOp>()) {
+    if (!targetOp || targetOp->isProperAncestor(parallelOp))
+      return false;
+  }
+
+  if (targetOp) {
+    if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) !=
+        omp::TargetExecMode::generic)
+      return false;
+  } else {
+    auto declTargetIface = op.getParentOfType<omp::DeclareTargetInterface>();
+    if (!declTargetIface || !declTargetIface.isDeclareTarget() ||
+        declTargetIface.getDeclareTargetDeviceType() ==
+            omp::DeclareTargetDeviceType::host)
+      return false;
+  }
+
+  return shouldReplaceAllocaWithUses(op.getUses());
+}
+
+/// Frees `allocVal` before each exit of its region dominated by its block.
+static void insertDeviceSharedMemDeallocation(OpBuilder &builder,
+                                              TypeAttr elemType,
+                                              Value arraySize,
+                                              IntegerAttr alignment,
+                                              Value allocVal) {
+  DominanceInfo dominance;
+  for (Block &candidate : allocVal.getParentRegion()->getBlocks()) {
+    Operation *exitOp = candidate.getTerminator();
+    if (exitOp->hasSuccessors() ||
+        !dominance.dominates(allocVal.getParentBlock(), &candidate))
+      continue; // Not an exit reached through the allocation's block.
+    builder.setInsertionPoint(exitOp);
+    omp::FreeSharedMemOp::create(builder, allocVal.getLoc(), elemType,
+                                 arraySize, alignment, allocVal);
+  }
+}
+
+namespace {
+/// Pass replacing eligible llvm.alloca ops with device shared memory.
+class StackToSharedPass
+    : public omp::impl::StackToSharedPassBase<StackToSharedPass> {
+public:
+  StackToSharedPass() = default;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    OpBuilder builder(context);
+
+    LLVM::LLVMFuncOp funcOp = getOperation();
+    auto offloadIface = funcOp->getParentOfType<omp::OffloadModuleInterface>();
+    if (!offloadIface || !offloadIface.getIsTargetDevice())
+      return;
+
+    llvm::SmallVector<Operation *> toBeDeleted;
+    funcOp->walk([&](LLVM::AllocaOp allocaOp) {
+      if (!shouldReplaceAllocaWithDeviceSharedMem(*allocaOp))
+        return;
+      // Replace llvm.alloca with omp.alloc_shared_mem.
+      Type resultType = allocaOp.getResult().getType();
+
+      // TODO: Improve handling of non-default address spaces. Currently, an
+      // alloca to a non-default address space is only supported if its single
+      // use is an addrspacecast to the default address space.
+      bool nonDefaultAddrSpace = false;
+      if (auto llvmPtrType = dyn_cast<LLVM::LLVMPointerType>(resultType))
+        nonDefaultAddrSpace = llvmPtrType.getAddressSpace() != 0;
+
+      builder.setInsertionPoint(allocaOp);
+      auto sharedAllocOp = omp::AllocSharedMemOp::create(
+          builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context),
+          allocaOp.getElemTypeAttr(), allocaOp.getArraySize(),
+          allocaOp.getAlignmentAttr());
+      if (nonDefaultAddrSpace) {
+        assert(allocaOp->hasOneUse() && " unsupported non-default address "
+                                        "space alloca with multiple uses");
+        auto asCastOp =
+            cast<LLVM::AddrSpaceCastOp>(*allocaOp->getUsers().begin());
+        asCastOp.replaceAllUsesWith(sharedAllocOp.getOperation());
+        // Defer deletion: the walk hasn't visited the cast op yet, and the
+        // alloca can't be deleted before the cast op that uses it.
+        toBeDeleted.push_back(asCastOp);
+        toBeDeleted.push_back(allocaOp);
+      } else {
+        allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation());
+        // Defer erasure: the alloca's attributes are still read below.
+        toBeDeleted.push_back(allocaOp);
+      }
+
+      // Create a new omp.free_shared_mem for the allocated buffer prior to
+      // exiting the region. The alloca is still alive here (erased below).
+      insertDeviceSharedMemDeallocation(
+          builder, allocaOp.getElemTypeAttr(), allocaOp.getArraySize(),
+          allocaOp.getAlignmentAttr(), sharedAllocOp.getResult());
+    });
+    for (Operation *op : toBeDeleted)
+      op->erase();
+  }
+};
+} // namespace
diff --git a/mlir/test/Dialect/OpenMP/stack-to-shared.mlir b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir
new file mode 100644
index 0000000000000..d14528e4f396a
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir
@@ -0,0 +1,149 @@
+// RUN: mlir-opt --omp-stack-to-shared %s | FileCheck %s
+
+module attributes {omp.is_target_device = true} {
+
+omp.declare_reduction @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
+  %2 = llvm.load %arg3 : !llvm.ptr -> f32
+  llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
+  omp.yield
+}
+omp.private {type = private} @privatizer_i32 : i32
+omp.private {type = firstprivate} @firstprivatizer_f32 : f32 copy {
+^bb0(%arg0: f32, %arg1: f32):
+  omp.yield(%arg0 : f32)
+}
+
+llvm.func @foo(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>}
+
+// CHECK-LABEL: llvm.func @device_func(
+// CHECK-SAME:  %[[N:.*]]: i64, %[[COND:.*]]: i1)
+llvm.func @device_func(%arg0: i64, %cond: i1) attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>} {
+  // CHECK: %[[ALLOC0:.*]] = omp.alloc_shared_mem %[[N]] x i64 : (i64) -> !llvm.ptr
+  %0 = llvm.alloca %arg0 x i64 : (i64) -> !llvm.ptr
+  // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem %[[N]] x f32 : (i64) align(128) -> !llvm.ptr
+  %1 = llvm.alloca %arg0 x f32 {alignment = 128} : (i64) -> !llvm.ptr
+  // CHECK: %[[ALLOC2:.*]] = omp.alloc_shared_mem %[[N]] x vector<16xf32> : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %arg0 x vector<16xf32> : (i64) -> !llvm.ptr
+  // CHECK: %[[ALLOC3:.*]] = omp.alloc_shared_mem %[[N]] x i32 : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr<5>
+  %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr
+
+  // CHECK: %[[ALLOC4:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr
+  %5 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr
+  // CHECK: %[[ALLOC5:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr
+  %6 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr
+  // CHECK: llvm.cond_br %[[COND]], ^[[IF:.*]], ^[[ELSE:.*]]
+  llvm.cond_br %cond, ^if, ^else
+
+// CHECK: ^[[IF]]:
+^if:
+  // CHECK: omp.parallel reduction(@add_f32 %[[ALLOC0]] -> %{{.*}} : !llvm.ptr)
+  omp.parallel reduction(@add_f32 %0 -> %arg1 : !llvm.ptr) {
+    // CHECK: %{{.*}} = llvm.load %[[ALLOC2]]
+    %7 = llvm.load %2 : !llvm.ptr -> vector<16xf32>
+    // CHECK: %{{.*}} = llvm.alloca
+    %8 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr
+    // CHECK: omp.wsloop private(@privatizer_i32 %[[ALLOC4]] -> %{{.*}}, @firstprivatizer_f32 %[[ALLOC1]] -> %{{.*}} : !llvm.ptr, !llvm.ptr)
+    omp.wsloop private(@privatizer_i32 %5 -> %arg2, @firstprivatizer_f32 %1 -> %arg3 : !llvm.ptr, !llvm.ptr) {
+      omp.loop_nest (%arg4) : i64 = (%arg0) to (%arg0) inclusive step (%arg0) {
+        llvm.call @foo(%arg1) : (!llvm.ptr) -> ()
+        llvm.call @foo(%8) : (!llvm.ptr) -> ()
+        llvm.call @foo(%arg2) : (!llvm.ptr) -> ()
+        llvm.call @foo(%arg3) : (!llvm.ptr) -> ()
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  // CHECK: llvm.br ^[[EXIT:.*]]
+  llvm.br ^exit
+
+// CHECK: ^[[ELSE]]:
+^else:
+  // CHECK: llvm.call @foo(%[[ALLOC3]]) : (!llvm.ptr) -> ()
+  llvm.call @foo(%4) : (!llvm.ptr) -> ()
+  // CHECK: %{{.*}} = llvm.load %[[ALLOC5]]
+  %8 = llvm.load %6 : !llvm.ptr -> i32
+  // CHECK: llvm.br ^[[EXIT]]
+  llvm.br ^exit
+
+// CHECK: ^[[EXIT]]:
+^exit:
+  // CHECK: omp.free_shared_mem [%[[N]] x i64 : (i64)] %[[ALLOC0]] : !llvm.ptr
+  // CHECK: omp.free_shared_mem [%[[N]] x f32 : (i64) align(128)] %[[ALLOC1]] : !llvm.ptr
+  // CHECK: omp.free_shared_mem [%[[N]] x vector<16xf32> : (i64)] %[[ALLOC2]] : !llvm.ptr
+  // CHECK: omp.free_shared_mem [%[[N]] x i32 : (i64)] %[[ALLOC3]] : !llvm.ptr
+  // CHECK-NOT: omp.free_shared_mem
+  // CHECK: llvm.return
+  llvm.return
+}
+
+// CHECK-LABEL: llvm.func @host_func(
+// CHECK-SAME:  %[[N:.*]]: i64)
+llvm.func @host_func(%arg0: i64) {
+  // CHECK: %[[ALLOC0:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr
+  %0 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr
+  // CHECK: omp.parallel
+  omp.parallel {
+    // CHECK: llvm.call @foo(%[[ALLOC0]]) : (!llvm.ptr) -> ()
+    llvm.call @foo(%0) : (!llvm.ptr) -> ()
+    // CHECK: omp.target
+    omp.target {
+      %c0 = llvm.mlir.constant(1 : i64) : i64
+      // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem [[ALLOC1_SIZE:.*]] -> !llvm.ptr
+      %1 = llvm.alloca %c0 x i32 : (i64) -> !llvm.ptr
+      // CHECK-NEXT: llvm.call @foo(%[[ALLOC1]]) : (!llvm.ptr) -> ()
+      llvm.call @foo(%1) : (!llvm.ptr) -> ()
+      // CHECK-NEXT: omp.free_shared_mem [[[ALLOC1_SIZE]]] %[[ALLOC1]] : !llvm.ptr
+      // CHECK-NEXT: omp.terminator
+      omp.terminator
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: llvm.func @target_spmd(
+llvm.func @target_spmd() {
+  // CHECK-NOT: omp.alloc_shared_mem
+  // CHECK-NOT: omp.free_shared_mem
+  omp.target {
+    %c = llvm.mlir.constant(1 : i64) : i64
+    %0 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr
+    omp.teams {
+      %1 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr
+      omp.parallel {
+        %2 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr
+        %3 = llvm.load %0 : !llvm.ptr -> i32
+        %4 = llvm.load %1 : !llvm.ptr -> i32
+        omp.distribute {
+          omp.wsloop {
+            omp.loop_nest (%arg0) : i64 = (%c) to (%c) inclusive step (%c) {
+              %5 = llvm.load %2 : !llvm.ptr -> i32
+              omp.yield
+            }
+          } {omp.composite}
+        } {omp.composite}
+        omp.terminator
+      } {omp.composite}
+      omp.terminator
+    }
+    omp.terminator
+  }
+  // CHECK: return
+  llvm.return
+}
+
+}