[flang-commits] [flang] [flang][OpenACC] add pass to bufferize fir.box recipes (PR #163543)

via flang-commits flang-commits at lists.llvm.org
Wed Oct 15 03:48:35 PDT 2025


https://github.com/jeanPerier created https://github.com/llvm/llvm-project/pull/163543

When working on privatization, it is easier to work with fir.box explicitly in memory; otherwise, there is no way to express that the fir.box will end up being a descriptor address in FIR, which makes it hard to deal with data management.

However, introducing fir.ref<fir.box> early can pessimize early HLFIR optimizations, because the extra memory indirection makes it harder to reason about the aliasing of `fir.ref<fir.box>`.

This patch introduces a pass that turns acc `!fir.box<T>` recipes into `!fir.ref<!fir.box<T>>` recipes and updates the related recipe usages to use `!fir.ref<!fir.box<T>>` (creating new alloca+store+load).

It is added to flang and not to OpenACC because it is specific to the `fir.box` type, so it makes little sense to make it a generic OpenACC pass and to create a new OpenACC dialect type interface for this use case.

>From 5874af3617f15c18bd31349312e0e89334d76a5e Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Wed, 15 Oct 2025 03:39:55 -0700
Subject: [PATCH] [flang][OpenACC] add pass to bufferize box recipes

---
 flang/include/flang/Optimizer/CMakeLists.txt  |   1 +
 .../flang/Optimizer/OpenACC/CMakeLists.txt    |   4 +
 .../include/flang/Optimizer/OpenACC/Passes.h  |  33 ++
 .../include/flang/Optimizer/OpenACC/Passes.td |  36 ++
 flang/lib/Optimizer/OpenACC/CMakeLists.txt    |   1 +
 .../Transforms/ACCRecipeBufferization.cpp     | 191 +++++++++++
 .../OpenACC/Transforms/CMakeLists.txt         |  11 +
 .../Fir/OpenACC/recipe-bufferization.mlir     | 316 ++++++++++++++++++
 flang/tools/fir-opt/CMakeLists.txt            |   1 +
 flang/tools/fir-opt/fir-opt.cpp               |   2 +
 10 files changed, 596 insertions(+)
 create mode 100644 flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
 create mode 100644 flang/include/flang/Optimizer/OpenACC/Passes.h
 create mode 100644 flang/include/flang/Optimizer/OpenACC/Passes.td
 create mode 100644 flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
 create mode 100644 flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
 create mode 100644 flang/test/Fir/OpenACC/recipe-bufferization.mlir

diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt
index 3336ac935e101..68af52f1b8dc7 100644
--- a/flang/include/flang/Optimizer/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/CMakeLists.txt
@@ -2,4 +2,5 @@ add_subdirectory(CodeGen)
 add_subdirectory(Dialect)
 add_subdirectory(HLFIR)
 add_subdirectory(Transforms)
+add_subdirectory(OpenACC)
 add_subdirectory(OpenMP)
diff --git a/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
new file mode 100644
index 0000000000000..a032488569b19
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name FIROpenACC)
+
+add_public_tablegen_target(FIROpenACCPassesIncGen)
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h
new file mode 100644
index 0000000000000..0627cc8ce4a6d
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - OpenACC pass entry points -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header declares the OpenACC passes specific to Fortran and FIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+
+#include <memory>
+
+namespace fir::acc {
+// Pull in the tablegen-generated pass declarations and pass registration
+// functions for the passes described in Passes.td.
+#define GEN_PASS_DECL
+#define GEN_PASS_REGISTRATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+
+/// Create an instance of the fir-acc-recipe-bufferization pass.
+std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass();
+
+} // namespace fir::acc
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td
new file mode 100644
index 0000000000000..3c127b30aa9b8
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.td
@@ -0,0 +1,36 @@
+//===-- Passes.td - flang OpenACC pass definitions -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def ACCRecipeBufferization
+    : Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> {
+  let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses";
+  let description = [{
+    Bufferizes OpenACC recipes that operate on fir.box<T> so their type and
+    region block arguments become fir.ref<fir.box<T>> instead. This applies to
+    the init and destroy regions of acc.private.recipe,
+    acc.firstprivate.recipe (including copy region), and
+    acc.reduction.recipe (including combiner region).
+
+    For affected regions, the pass inserts required loads at the beginning of
+    the region to preserve original uses after argument type changes. For yields
+    of box values, the pass allocates a local fir.ref<fir.box<T>> and stores the
+    yielded fir.box<T> into it so the region yields a reference to a box.
+
+    For acc.private, acc.firstprivate, and acc.reduction operations that are
+    paired with a bufferized recipe by an acc.parallel, acc.serial, or acc.loop
+    operation, the pass allocates a fir.ref<fir.box<T>> right after the
+    definition of the data op variable, stores the variable into it, and
+    rewires the data op to use the new memory. Users of the data operation
+    result other than the paired compute op itself (typically operations nested
+    in its region) are updated to load through the reference.
+  }];
+}
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES
diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
index fc23e64eeb7a4..790b9fdb1589a 100644
--- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(Support)
+add_subdirectory(Transforms)
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
new file mode 100644
index 0000000000000..4840a999ecd27
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
@@ -0,0 +1,191 @@
+//===- ACCRecipeBufferization.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bufferize OpenACC recipes that yield fir.box<T> to operate on
+// fir.ref<fir.box<T>> and update uses accordingly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+namespace fir::acc {
+#define GEN_PASS_DEF_ACCRECIPEBUFFERIZATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+} // namespace fir::acc
+
+namespace {
+
+/// Helpers describing how recipe values are moved to memory: which recipe
+/// types must be rewritten, and how values are loaded from or stored to
+/// memory.
+class BufferizeInterface {
+public:
+  /// Return the type that must replace \p recipeType, or std::nullopt when
+  /// \p recipeType needs no rewrite. Only fir box types are rewritten; they
+  /// become a fir reference to the box type.
+  static std::optional<mlir::Type> mustBufferize(mlir::Type recipeType) {
+    if (!llvm::isa<fir::BaseBoxType>(recipeType))
+      return std::nullopt;
+    return fir::ReferenceType::get(recipeType);
+  }
+
+  /// Create a fir.load of \p value at the insertion point of \p builder and
+  /// return the created operation.
+  static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc,
+                               mlir::Value value) {
+    fir::LoadOp loadOp = builder.create<fir::LoadOp>(loc, value);
+    return loadOp.getOperation();
+  }
+
+  /// Create a fir.alloca of the type of \p value at the insertion point of
+  /// \p builder, store \p value into it, and return the alloca result.
+  static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc,
+                                   mlir::Value value) {
+    fir::AllocaOp storage = builder.create<fir::AllocaOp>(loc, value.getType());
+    builder.create<fir::StoreOp>(loc, value, storage);
+    return storage.getResult();
+  }
+};
+
+/// Rewrite \p region so that it operates on \p newType instead of \p oldType.
+///
+/// Every region argument of type \p oldType is retyped to \p newType; when it
+/// has uses, a load of the argument is inserted at the start of the region and
+/// the previous uses are redirected to the loaded value. Every acc.yield that
+/// terminates a block of the region gets its operands of type \p oldType
+/// replaced by a new temporary holding the yielded value. Empty regions are
+/// left untouched.
+static void bufferizeRegionArgsAndYields(mlir::Region &region,
+                                         mlir::Location loc, mlir::Type oldType,
+                                         mlir::Type newType) {
+  if (region.empty())
+    return;
+
+  // The builder starts at the top of the entry block, so the loads are created
+  // ahead of the operations that were already in the region.
+  mlir::OpBuilder builder(&region);
+  for (mlir::BlockArgument arg : region.getArguments()) {
+    if (arg.getType() != oldType)
+      continue;
+    arg.setType(newType);
+    if (arg.use_empty())
+      continue;
+    mlir::Operation *loadOp = BufferizeInterface::load(builder, loc, arg);
+    // The load itself must keep reading the retyped argument.
+    arg.replaceAllUsesExcept(loadOp->getResult(0), loadOp);
+  }
+
+  // Visit the terminator of every block, not only of the last one: a region
+  // with several blocks may yield from any of them, and all the yields must
+  // agree with the new recipe type.
+  for (mlir::Block &block : region) {
+    if (block.empty())
+      continue;
+    auto yield = llvm::dyn_cast<mlir::acc::YieldOp>(&block.back());
+    if (!yield)
+      continue;
+    llvm::SmallVector<mlir::Value> newOperands;
+    newOperands.reserve(yield.getNumOperands());
+    bool changed = false;
+    for (mlir::Value oldYieldArg : yield.getOperands()) {
+      if (oldYieldArg.getType() != oldType) {
+        newOperands.push_back(oldYieldArg);
+        continue;
+      }
+      // Spill the yielded value right before the yield and yield its address.
+      builder.setInsertionPoint(yield);
+      newOperands.push_back(
+          BufferizeInterface::placeInMemory(builder, loc, oldYieldArg));
+      changed = true;
+    }
+    if (changed)
+      yield->setOperands(newOperands);
+  }
+}
+
+/// Update the data operations that \p computeOp pairs with the recipe named
+/// \p recipeSymName so that they match the bufferized recipe type.
+///
+/// \p recipes and \p operands are the matching lists of recipe symbols and
+/// data operation results of \p computeOp. For each operand whose recipe leaf
+/// reference is \p recipeSymName, the variable of the defining acc.private,
+/// acc.firstprivate, or acc.reduction operation is stored into a new temporary
+/// that becomes the variable of the data operation, and the data operation
+/// result takes the type of that temporary. Every user of the result other
+/// than \p computeOp then reads it through a load inserted right before the
+/// user. Nothing is done when \p recipes is null.
+static void updateRecipeUse(mlir::ArrayAttr recipes, mlir::ValueRange operands,
+                            llvm::StringRef recipeSymName,
+                            mlir::Operation *computeOp) {
+  if (!recipes)
+    return;
+  for (auto [recipeSym, oldRes] : llvm::zip(recipes, operands)) {
+    if (llvm::cast<mlir::SymbolRefAttr>(recipeSym).getLeafReference() !=
+        recipeSymName)
+      continue;
+
+    mlir::Operation *dataOp = oldRes.getDefiningOp();
+    assert(dataOp && "dataOp must be paired with computeOp");
+    mlir::Location loc = dataOp->getLoc();
+    mlir::OpBuilder builder(dataOp);
+    llvm::TypeSwitch<mlir::Operation *, void>(dataOp)
+        .Case<mlir::acc::PrivateOp, mlir::acc::FirstprivateOp,
+              mlir::acc::ReductionOp>([&](auto privateOp) {
+          // The temporary is created right after the definition of the
+          // variable it is initialized with.
+          builder.setInsertionPointAfterValue(privateOp.getVar());
+          mlir::Value alloca = BufferizeInterface::placeInMemory(
+              builder, loc, privateOp.getVar());
+          privateOp.getVarMutable().assign(alloca);
+          privateOp.getAccVar().setType(alloca.getType());
+        });
+
+    // Value::getUsers() lists an operation once per use of the value. Keep
+    // each user only once so that no dead load is created for an operation
+    // using the value in several operands. The list is built up front because
+    // the loads created below are themselves new users of oldRes.
+    llvm::SmallVector<mlir::Operation *> users;
+    for (mlir::Operation *user : oldRes.getUsers())
+      if (user != computeOp && !llvm::is_contained(users, user))
+        users.push_back(user);
+    for (mlir::Operation *useOp : users) {
+      builder.setInsertionPoint(useOp);
+      mlir::Operation *load = BufferizeInterface::load(builder, loc, oldRes);
+      useOp->replaceUsesOfWith(oldRes, load->getResult(0));
+    }
+  }
+}
+
+/// Module pass that first rewrites the acc recipes whose type must be
+/// bufferized, then updates the data operations that acc.loop, acc.parallel,
+/// and acc.serial operations pair with those recipes.
+class ACCRecipeBufferization
+    : public fir::acc::impl::ACCRecipeBufferizationBase<
+          ACCRecipeBufferization> {
+public:
+  void runOnOperation() override {
+    mlir::ModuleOp moduleOp = getOperation();
+
+    // Step 1: rewrite the recipes and remember the names of the rewritten
+    // ones.
+    llvm::SmallVector<llvm::StringRef> bufferizedRecipes;
+    moduleOp.walk([&](mlir::Operation *op) {
+      llvm::TypeSwitch<mlir::Operation *, void>(op)
+          .Case<mlir::acc::PrivateRecipeOp, mlir::acc::FirstprivateRecipeOp,
+                mlir::acc::ReductionRecipeOp>([&](auto recipeOp) {
+            using RecipeOp = decltype(recipeOp);
+            mlir::Type previousType = recipeOp.getType();
+            std::optional<mlir::Type> memoryType =
+                BufferizeInterface::mustBufferize(previousType);
+            if (!memoryType.has_value())
+              return;
+            recipeOp.setTypeAttr(mlir::TypeAttr::get(*memoryType));
+            mlir::Location loc = recipeOp.getLoc();
+            auto rewriteRegion = [&](mlir::Region &region) {
+              bufferizeRegionArgsAndYields(region, loc, previousType,
+                                           *memoryType);
+            };
+            // Regions are visited in the order init, copy or combiner (when
+            // the recipe kind has one), destroy.
+            rewriteRegion(recipeOp.getInitRegion());
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::FirstprivateRecipeOp>)
+              rewriteRegion(recipeOp.getCopyRegion());
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::ReductionRecipeOp>)
+              rewriteRegion(recipeOp.getCombinerRegion());
+            rewriteRegion(recipeOp.getDestroyRegion());
+            bufferizedRecipes.push_back(recipeOp.getSymName());
+          });
+    });
+    if (bufferizedRecipes.empty())
+      return;
+
+    // Step 2: update the data operations paired with a rewritten recipe.
+    // updateRecipeUse ignores null recipe lists, so absent recipe attributes
+    // need no check here.
+    moduleOp.walk([&](mlir::Operation *op) {
+      llvm::TypeSwitch<mlir::Operation *, void>(op)
+          .Case<mlir::acc::LoopOp, mlir::acc::ParallelOp, mlir::acc::SerialOp>(
+              [&](auto computeOp) {
+                for (llvm::StringRef recipeName : bufferizedRecipes) {
+                  updateRecipeUse(computeOp.getPrivatizationRecipesAttr(),
+                                  computeOp.getPrivateOperands(), recipeName,
+                                  op);
+                  updateRecipeUse(computeOp.getFirstprivatizationRecipesAttr(),
+                                  computeOp.getFirstprivateOperands(),
+                                  recipeName, op);
+                  updateRecipeUse(computeOp.getReductionRecipesAttr(),
+                                  computeOp.getReductionOperands(), recipeName,
+                                  op);
+                }
+              });
+    });
+  }
+};
+
+} // namespace
+
+/// Create a new instance of the ACCRecipeBufferization pass.
+std::unique_ptr<mlir::Pass> fir::acc::createACCRecipeBufferizationPass() {
+  return std::make_unique<ACCRecipeBufferization>();
+}
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..3a2d945f8ac5a
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_flang_library(FIROpenACCTransforms
+  ACCRecipeBufferization.cpp
+
+  DEPENDS
+  FIROpenACCPassesIncGen
+
+  LINK_LIBS
+  MLIRIR
+  FIRDialect
+  MLIROpenACCDialect
+)
diff --git a/flang/test/Fir/OpenACC/recipe-bufferization.mlir b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
new file mode 100644
index 0000000000000..c4f96f63d5076
--- /dev/null
+++ b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
@@ -0,0 +1,316 @@
+// RUN: fir-opt %s --fir-acc-recipe-bufferization -split-input-file | FileCheck %s
+
+// -----
+
+acc.private.recipe @priv_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.allocmem i32
+  %2 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  %0 = fir.box_addr %arg1 : (!fir.box<i32>) -> !fir.ref<i32>
+  %1 = fir.convert %0 : (!fir.ref<i32>) -> !fir.heap<i32>
+  fir.freemem %1 : !fir.heap<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[DARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[DARG1:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LD1:.*]] = fir.load %[[DARG1]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[ADDR:.*]] = fir.box_addr %[[LD1]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[CVT:.*]] = fir.convert %[[ADDR]] : (!fir.ref<i32>) -> !fir.heap<i32>
+
+// -----
+
+// Test private recipe without destroy region.
+
+acc.private.recipe @priv_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: }
+
+// -----
+
+// Firstprivate recipe with destroy region.
+acc.firstprivate.recipe @fp_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP]] to %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.box<i32>>, %[[DST:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST:.*]] = fir.load %[[DST]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR:.*]] = fir.box_addr %[[LSRC]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL:.*]] = fir.load %[[SADDR]] : !fir.ref<i32>
+// CHECK:   %[[DADDR:.*]] = fir.box_addr %[[LDST]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL]] to %[[DADDR]] : !fir.ref<i32>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[FDARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[FDARG1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Firstprivate recipe without destroy region.
+acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP2:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP2]] to %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC2:.*]]: !fir.ref<!fir.box<i32>>, %[[DST2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC2:.*]] = fir.load %[[SRC2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST2:.*]] = fir.load %[[DST2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR2:.*]] = fir.box_addr %[[LSRC2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL2:.*]] = fir.load %[[SADDR2]] : !fir.ref<i32>
+// CHECK:   %[[DADDR2:.*]] = fir.box_addr %[[LDST2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL2]] to %[[DADDR2]] : !fir.ref<i32>
+
+// -----
+
+// Reduction recipe with destroy region.
+acc.reduction.recipe @red_ref_box : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR]] to %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS:.*]] = fir.load %[[LHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS:.*]] = fir.load %[[RHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR:.*]] = fir.box_addr %[[LLHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL:.*]] = fir.load %[[LADDR]] : !fir.ref<i32>
+// CHECK:   %[[RADDR:.*]] = fir.box_addr %[[LRHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL:.*]] = fir.load %[[RADDR]] : !fir.ref<i32>
+// CHECK:   %[[SUM:.*]] = arith.addi %[[LVAL]], %[[RVAL]] : i32
+// CHECK:   %[[I32ALLOCA:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM]] to %[[I32ALLOCA]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX:.*]] = fir.embox %[[I32ALLOCA]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX]] to %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[RD0:.*]]: !fir.ref<!fir.box<i32>>, %[[RD1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Reduction recipe without destroy region.
+acc.reduction.recipe @red_ref_box_no_destroy : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box_no_destroy : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR2:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR2]] to %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS2:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS2:.*]] = fir.load %[[LHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS2:.*]] = fir.load %[[RHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR2:.*]] = fir.box_addr %[[LLHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL2:.*]] = fir.load %[[LADDR2]] : !fir.ref<i32>
+// CHECK:   %[[RADDR2:.*]] = fir.box_addr %[[LRHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL2:.*]] = fir.load %[[RADDR2]] : !fir.ref<i32>
+// CHECK:   %[[SUM2:.*]] = arith.addi %[[LVAL2]], %[[RVAL2]] : i32
+// CHECK:   %[[I32ALLOCA2:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM2]] to %[[I32ALLOCA2]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX2:.*]] = fir.embox %[[I32ALLOCA2]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX2]] to %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+
+// -----
+
+// Comprehensive tests that also test recipe usages updates.
+
+acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+^bb0(%arg0: !fir.ref<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.declare %0 {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  acc.yield %1 : !fir.ref<i32>
+}
+acc.private.recipe @privatization_box_Uxf32 : !fir.box<!fir.array<?xf32>> init {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>):
+  %c0 = arith.constant 0 : index
+  %0:3 = fir.box_dims %arg0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  %1 = fir.shape %0#1 : (index) -> !fir.shape<1>
+  %2 = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
+  %3 = fir.declare %2(%1) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+  %4 = fir.embox %3(%1) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+  acc.yield %4 : !fir.box<!fir.array<?xf32>>
+} destroy {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: !fir.box<!fir.array<?xf32>>):
+  %0 = fir.box_addr %arg1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+  %1 = fir.convert %0 : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+  fir.freemem %1 : !fir.heap<!fir.array<?xf32>>
+  acc.terminator
+}
+func.func @_QPfoo(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+  %c200_i32 = arith.constant 200 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+  acc.parallel combined(loop) {
+    %4 = acc.private var(%3 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {name = "x"}
+    %5 = acc.private varPtr(%2 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+    acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %4 : !fir.box<!fir.array<?xf32>>, @privatization_ref_i32 -> %5 : !fir.ref<i32>) control(%arg1 : i32) = (%c1_i32 : i32) to (%c200_i32 : i32)  step (%c1_i32 : i32) {
+      %6 = fir.dummy_scope : !fir.dscope
+      %7 = fir.declare %4 dummy_scope %6 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+      %8 = fir.declare %5 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      %9 = fir.convert %arg1 : (i32) -> f32
+      %10 = fir.convert %arg1 : (i32) -> i64
+      %11 = fir.array_coor %7 %10 : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      fir.store %9 to %11 : !fir.ref<f32>
+      acc.yield
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  return
+}
+
+// CHECK-LABEL:   acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+// CHECK:           %[[VAL_1:.*]] = fir.alloca i32
+// CHECK:           %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           acc.yield %[[VAL_2]] : !fir.ref<i32>
+// CHECK:         }
+
+// CHECK-LABEL:   acc.private.recipe @privatization_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]] = fir.allocmem !fir.array<?xf32>, %[[VAL_3]]#1 {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:           %[[VAL_6:.*]] = fir.declare %[[VAL_5]](%[[VAL_4]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]](%[[VAL_4]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.yield %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+
+// CHECK-LABEL:   } destroy {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+// CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           fir.freemem %[[VAL_4]] : !fir.heap<!fir.array<?xf32>>
+// CHECK:           acc.terminator
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @_QPfoo(
+// CHECK-SAME:                      %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 200 : i32
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_3]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_5]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.parallel combined(loop) {
+// CHECK:             %[[VAL_7:.*]] = acc.private varPtr(%[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> {name = "x"}
+// CHECK:             %[[VAL_8:.*]] = acc.private varPtr(%[[VAL_4]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+// CHECK:             acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, @privatization_ref_i32 -> %[[VAL_8]] : !fir.ref<i32>) control(%[[VAL_9:.*]] : i32) = (%[[VAL_1]] : i32) to (%[[VAL_0]] : i32)  step (%[[VAL_1]] : i32) {
+// CHECK:               %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:               %[[VAL_12:.*]] = fir.declare %[[VAL_11]] dummy_scope %[[VAL_10]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:               %[[VAL_13:.*]] = fir.declare %[[VAL_8]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
+// CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+// CHECK:               %[[VAL_16:.*]] = fir.array_coor %[[VAL_12]] %[[VAL_15]] : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:               fir.store %[[VAL_14]] to %[[VAL_16]] : !fir.ref<f32>
+// CHECK:               acc.yield
+// CHECK:             } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+// CHECK:             acc.yield
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt
index 4ee9752727b87..c5bd4390a4b78 100644
--- a/flang/tools/fir-opt/CMakeLists.txt
+++ b/flang/tools/fir-opt/CMakeLists.txt
@@ -22,6 +22,7 @@ target_link_libraries(fir-opt PRIVATE
   HLFIRDialect
   HLFIRTransforms
   FIROpenACCSupport
+  FIROpenACCTransforms
   FIROpenMPSupport
   FlangOpenMPTransforms
   FIRAnalysis
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index d66fc3f08bdf8..b0b277b88dbe2 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/Support/InitFIR.h"
 #include "flang/Optimizer/Transforms/Passes.h"
@@ -37,6 +38,7 @@ int main(int argc, char **argv) {
   fir::registerOptTransformPasses();
   hlfir::registerHLFIRPasses();
   flangomp::registerFlangOpenMPPasses();
+  fir::acc::registerFIROpenACCPasses();
 #ifdef FLANG_INCLUDE_TESTS
   fir::test::registerTestFIRAliasAnalysisPass();
   fir::test::registerTestFIROpenACCInterfacesPass();



More information about the flang-commits mailing list