[Mlir-commits] [flang] [mlir] [flang][acc] Add ACCOptimizeFirstprivateMap pass (PR #178546)

Razvan Lupusoru llvmlistbot at llvm.org
Thu Jan 29 10:45:07 PST 2026


https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/178546

>From 43bcd202e5b152b2507b4dfaab96c81aca7b3b80 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Wed, 28 Jan 2026 15:59:26 -0800
Subject: [PATCH 1/6] [flang][acc] Add ACCOptimizeFirstprivateMap pass

This pass optimizes acc.firstprivate_map operations, which are generated
during OpenACC recipe materialization when an acc.firstprivate is
materialized into a mapping plus a private allocation inside the region.
The optimization applies to scalar variables of trivial types
(integers, reals, logicals) as long as they are not optional.

The pass hoists loads from the firstprivate variable to before the
compute region, converting the firstprivate copy into a pass-by-value
pattern. This eliminates the need for the runtime to copy the
firstprivate variable, since only its value is needed to initialize
the private copies.
---
 .../include/flang/Optimizer/OpenACC/Passes.h  |   1 +
 .../include/flang/Optimizer/OpenACC/Passes.td |  12 +
 .../OpenACC/Support/FIROpenACCOpsInterfaces.h |   6 +-
 .../OpenACC/Support/FIROpenACCUtils.h         |   8 +
 .../Support/FIROpenACCTypeInterfaces.cpp      |  39 +--
 .../OpenACC/Support/FIROpenACCUtils.cpp       |  35 +++
 .../Transforms/ACCOptimizeFirstprivateMap.cpp | 190 ++++++++++++++
 .../OpenACC/Transforms/CMakeLists.txt         |   3 +-
 .../OpenACC/acc-optimize-firstprivate-map.fir | 241 ++++++++++++++++++
 .../Dialect/OpenACC/OpenACCOpsInterfaces.td   |   8 +
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |   6 +-
 11 files changed, 510 insertions(+), 39 deletions(-)
 create mode 100644 flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
 create mode 100644 flang/test/Transforms/OpenACC/acc-optimize-firstprivate-map.fir

diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h
index 64ddb84e63c3e..fd1c4db8d78c0 100644
--- a/flang/include/flang/Optimizer/OpenACC/Passes.h
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.h
@@ -29,6 +29,7 @@ namespace acc {
 #include "flang/Optimizer/OpenACC/Passes.h.inc"
 
 std::unique_ptr<mlir::Pass> createACCInitializeFIRAnalysesPass();
+std::unique_ptr<mlir::Pass> createACCOptimizeFirstprivateMapPass();
 std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass();
 std::unique_ptr<mlir::Pass> createACCUseDeviceCanonicalizerPass();
 
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td
index 8579a471d9a56..ea324b0ae5f7f 100644
--- a/flang/include/flang/Optimizer/OpenACC/Passes.td
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.td
@@ -70,4 +70,16 @@ def ACCUseDeviceCanonicalizer
   let dependentDialects = ["mlir::acc::OpenACCDialect", "fir::FIROpsDialect"];
 }
 
+def ACCOptimizeFirstprivateMap
+    : Pass<"acc-optimize-firstprivate-map", "mlir::func::FuncOp"> {
+  let summary = "Optimize firstprivate mapping";
+  let description = [{
+    This pass optimizes acc firstprivate mapping operations by hoisting
+    loads from the mapped variable to before the compute region. This enables
+    pass-by-value instead of using global memory mapping through the
+    runtime.
+  }];
+  let dependentDialects = ["mlir::acc::OpenACCDialect", "fir::FIROpsDialect"];
+}
+
 #endif // FORTRAN_OPTIMIZER_OPENACC_PASSES
diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h
index 4847f3920eec1..7a68ee6234ece 100644
--- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h
+++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h
@@ -94,7 +94,11 @@ struct OutlineRematerializationModel
 template <typename Op>
 struct OffloadRegionModel
     : public mlir::acc::OffloadRegionOpInterface::ExternalModel<
-          OffloadRegionModel<Op>, Op> {};
+          OffloadRegionModel<Op>, Op> {
+  mlir::Region &getOffloadRegion(mlir::Operation *op) const {
+    return mlir::cast<Op>(op).getRegion();
+  }
+};
 
 /// External model for fir::OperationMoveOpInterface.
 /// This interface provides methods to identify whether
diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h
index 79f5626df4d24..360f4eb000a9e 100644
--- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h
+++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h
@@ -88,6 +88,14 @@ createOrGetReductionRecipe(mlir::OpBuilder &builder, mlir::Location loc,
                            llvm::SmallVector<mlir::Value> &dataBoundOps,
                            mlir::Attribute fastMathAttr = {});
 
+/// Walks through operations that forward or view their operand and returns
+/// the original defining value. This strips operations like fir.convert,
+/// ViewLikeOpInterface, and optionally fir.declare/hlfir.declare.
+/// \param value The value to trace back to its origin
+/// \param stripDeclare If true (default), also strips declare operations
+/// \return The original value after stripping all intermediate operations
+mlir::Value getOriginalDef(mlir::Value value, bool stripDeclare = true);
+
 } // namespace acc
 } // namespace fir
 
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
index 7619a21fc0477..2c855ecbe99e1 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/DirectivesCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -1211,40 +1212,6 @@ template mlir::Value OpenACCPointerLikeModel<fir::LLVMPointerType>::genAllocate(
     llvm::StringRef varName, mlir::Type varType, mlir::Value originalVar,
     bool &needsFree) const;
 
-static mlir::Value stripCasts(mlir::Value value, bool stripDeclare = true) {
-  mlir::Value currentValue = value;
-
-  while (currentValue) {
-    auto *definingOp = currentValue.getDefiningOp();
-    if (!definingOp)
-      break;
-
-    if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(definingOp)) {
-      currentValue = convertOp.getValue();
-      continue;
-    }
-
-    if (auto viewLike = mlir::dyn_cast<mlir::ViewLikeOpInterface>(definingOp)) {
-      currentValue = viewLike.getViewSource();
-      continue;
-    }
-
-    if (stripDeclare) {
-      if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(definingOp)) {
-        currentValue = declareOp.getMemref();
-        continue;
-      }
-
-      if (auto declareOp = mlir::dyn_cast<fir::DeclareOp>(definingOp)) {
-        currentValue = declareOp.getMemref();
-        continue;
-      }
-    }
-    break;
-  }
-
-  return currentValue;
-}
 
 template <typename Ty>
 bool OpenACCPointerLikeModel<Ty>::genFree(
@@ -1273,7 +1240,7 @@ bool OpenACCPointerLikeModel<Ty>::genFree(
   mlir::Value valueToInspect = allocRes ? allocRes : varToFree;
 
   // Strip casts and declare operations to find the original allocation
-  mlir::Value strippedValue = stripCasts(valueToInspect);
+  mlir::Value strippedValue = fir::acc::getOriginalDef(valueToInspect);
   mlir::Operation *originalAlloc = strippedValue.getDefiningOp();
 
   // If we found an AllocMemOp (heap allocation), free it
@@ -1511,7 +1478,7 @@ static bool hasCUDADeviceAttrOnFuncArg(mlir::BlockArgument blockArg) {
 /// Shared implementation for checking if a value represents device data.
 static bool isDeviceDataImpl(mlir::Value var) {
   // Strip casts to find the underlying value.
-  mlir::Value currentVal = stripCasts(var, /*stripDeclare=*/false);
+  mlir::Value currentVal = fir::acc::getOriginalDef(var, /*stripDeclare=*/false);
 
   if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(currentVal))
     return hasCUDADeviceAttrOnFuncArg(blockArg);
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp
index 5199271da3a2a..890ffe78db1c9 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp
@@ -619,3 +619,38 @@ mlir::SymbolRefAttr fir::acc::createOrGetReductionRecipe(
   mlir::acc::YieldOp::create(builder, loc, dest);
   return mlir::SymbolRefAttr::get(builder.getContext(), recipe.getSymName());
 }
+
+mlir::Value fir::acc::getOriginalDef(mlir::Value value, bool stripDeclare) {
+  mlir::Value currentValue = value;
+
+  while (currentValue) {
+    auto *definingOp = currentValue.getDefiningOp();
+    if (!definingOp)
+      break;
+
+    if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(definingOp)) {
+      currentValue = convertOp.getValue();
+      continue;
+    }
+
+    if (auto viewLike = mlir::dyn_cast<mlir::ViewLikeOpInterface>(definingOp)) {
+      currentValue = viewLike.getViewSource();
+      continue;
+    }
+
+    if (stripDeclare) {
+      if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(definingOp)) {
+        currentValue = declareOp.getMemref();
+        continue;
+      }
+
+      if (auto declareOp = mlir::dyn_cast<fir::DeclareOp>(definingOp)) {
+        currentValue = declareOp.getMemref();
+        continue;
+      }
+    }
+    break;
+  }
+
+  return currentValue;
+}
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
new file mode 100644
index 0000000000000..b0444be73862f
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
@@ -0,0 +1,190 @@
+//===- ACCOptimizeFirstprivateMap.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes firstprivate mapping operations (acc.firstprivate_map).
+// The optimization hoists loads from the firstprivate variable to before the
+// compute region, effectively converting the firstprivate copy to a
+// pass-by-value pattern. This eliminates the need for runtime copying into
+// global memory.
+//
+// Example transformation:
+//
+//   Before:
+//     %decl = fir.declare %alloca : !fir.ref<i32>
+//     %fp = acc.firstprivate_map varPtr(%decl) -> !fir.ref<i32>
+//     acc.parallel {
+//       %val = fir.load %fp : !fir.ref<i32>  // load inside region
+//       ...
+//     }
+//
+//   After:
+//     %decl = fir.declare %alloca : !fir.ref<i32>
+//     %val = fir.load %decl : !fir.ref<i32>  // load hoisted before region
+//     acc.parallel {
+//       ...  // uses %val directly
+//     }
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/FortranVariableInterface.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace fir::acc {
+#define GEN_PASS_DEF_ACCOPTIMIZEFIRSTPRIVATEMAP
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+} // namespace fir::acc
+
+using namespace mlir;
+
+namespace {
+
+/// Returns the enclosing offload region interface, or nullptr if not inside
+/// one.
+static acc::OffloadRegionOpInterface getEnclosingOffloadRegion(Operation *op) {
+  Operation *parent = op->getParentOp();
+  while (parent) {
+    if (auto offloadOp = dyn_cast<acc::OffloadRegionOpInterface>(parent))
+      return offloadOp;
+    parent = parent->getParentOp();
+  }
+  return nullptr;
+}
+
+/// Returns true if the value is defined by an OpenACC data clause operation.
+static bool isDefinedByDataClause(Value value) {
+  Operation *defOp = value.getDefiningOp();
+  if (!defOp)
+    return false;
+  return acc::getDataClause(defOp).has_value();
+}
+
+/// Returns true if the value is defined inside the given offload region.
+/// This handles both operation results and block arguments.
+static bool isDefinedInsideRegion(Value value,
+                                  acc::OffloadRegionOpInterface offloadOp) {
+  Region *valueRegion = value.getParentRegion();
+  if (!valueRegion)
+    return false;
+  return offloadOp.getOffloadRegion().isAncestor(valueRegion);
+}
+
+/// Returns true if the variable may be optional.
+static bool mayBeOptionalVariable(Value var) {
+  // Don't strip declare ops - we need to check the optional attribute on them.
+  Value originalDef = fir::acc::getOriginalDef(var, /*stripDeclare=*/false);
+  if (auto varIface = dyn_cast_or_null<fir::FortranVariableOpInterface>(
+          originalDef.getDefiningOp()))
+    return varIface.isOptional();
+  // If the defining op is an alloca, it's a local variable and not optional.
+  if (isa_and_nonnull<fir::AllocaOp, fir::AllocMemOp>(
+          originalDef.getDefiningOp()))
+    return false;
+  // Conservative: if we can't determine, assume it may be optional.
+  return true;
+}
+
+/// Returns true if the type is a reference to a trivial type.
+/// Note that this does not allow fir.heap, fir.ptr, or fir.llvm_ptr
+/// types - since we would need to check if the load is valid via
+/// a null-check to enable the optimization.
+static bool isRefToTrivialType(Type type) {
+  if (!mlir::isa<fir::ReferenceType>(type))
+    return false;
+  return fir::isa_trivial(fir::unwrapRefType(type));
+}
+
+static void hoistLoads(acc::FirstprivateMapInitialOp firstprivateInitOp,
+                       Value var, Value accVar) {
+  llvm::SmallVector<fir::LoadOp> loadsToHoist;
+  for (Operation *user : accVar.getUsers()) {
+    if (auto loadOp = dyn_cast<fir::LoadOp>(user))
+      loadsToHoist.push_back(loadOp);
+  }
+
+  for (fir::LoadOp loadOp : loadsToHoist) {
+    loadOp.getMemrefMutable().assign(var);
+    loadOp->moveBefore(firstprivateInitOp);
+  }
+}
+
+class ACCOptimizeFirstprivateMap
+    : public fir::acc::impl::ACCOptimizeFirstprivateMapBase<
+          ACCOptimizeFirstprivateMap> {
+public:
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    // Collect all firstprivate_map ops first to avoid modifying IR during walk.
+    llvm::SmallVector<acc::FirstprivateMapInitialOp> firstprivateOps;
+    funcOp.walk([&](acc::FirstprivateMapInitialOp op) {
+      firstprivateOps.push_back(op);
+    });
+
+    llvm::SmallVector<acc::FirstprivateMapInitialOp> opsToErase;
+
+    for (acc::FirstprivateMapInitialOp firstprivateInitOp : firstprivateOps) {
+      Value var = firstprivateInitOp.getVar();
+
+      if (auto offloadOp = getEnclosingOffloadRegion(firstprivateInitOp)) {
+        // Inside an offload region.
+        if (isDefinedByDataClause(var) ||
+            isDefinedInsideRegion(var, offloadOp)) {
+          // The variable is already mapped or defined locally - just replace
+          // uses and erase.
+          firstprivateInitOp.getAccVar().replaceAllUsesWith(var);
+          opsToErase.push_back(firstprivateInitOp);
+        } else {
+          // Variable is defined outside - hoist the op out of the region,
+          // then apply optimization.
+          firstprivateInitOp->moveBefore(offloadOp);
+          if (optimizeFirstprivateMapping(firstprivateInitOp))
+            opsToErase.push_back(firstprivateInitOp);
+        }
+      } else {
+        // Outside offload region, apply type-restricted optimization
+        // to pre-load before the compute region.
+        if (optimizeFirstprivateMapping(firstprivateInitOp))
+          opsToErase.push_back(firstprivateInitOp);
+      }
+    }
+
+    for (auto op : opsToErase)
+      op.erase();
+  }
+
+private:
+  /// Returns true if the operation was optimized and can be erased.
+  static bool optimizeFirstprivateMapping(
+      acc::FirstprivateMapInitialOp firstprivateInitOp) {
+    Value var = firstprivateInitOp.getVar();
+    Value accVar = firstprivateInitOp.getAccVar();
+
+    // Only optimize references to trivial types.
+    if (!isRefToTrivialType(var.getType()))
+      return false;
+
+    // Avoid hoisting optional variables as they may be
+    // null and thus not safe to access.
+    if (mayBeOptionalVariable(var))
+      return false;
+
+    hoistLoads(firstprivateInitOp, var, accVar);
+    return true;
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> fir::acc::createACCOptimizeFirstprivateMapPass() {
+  return std::make_unique<ACCOptimizeFirstprivateMap>();
+}
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
index f122fae461a38..10c6c20dc3e06 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
 add_flang_library(FIROpenACCTransforms
-  ACCUseDeviceCanonicalizer.cpp
   ACCInitializeFIRAnalyses.cpp
+  ACCOptimizeFirstprivateMap.cpp
   ACCRecipeBufferization.cpp
+  ACCUseDeviceCanonicalizer.cpp
 
   DEPENDS
   FIROpenACCPassesIncGen
diff --git a/flang/test/Transforms/OpenACC/acc-optimize-firstprivate-map.fir b/flang/test/Transforms/OpenACC/acc-optimize-firstprivate-map.fir
new file mode 100644
index 0000000000000..bce575e752866
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-optimize-firstprivate-map.fir
@@ -0,0 +1,241 @@
+// RUN: fir-opt %s --acc-optimize-firstprivate-map -split-input-file | FileCheck %s
+
+// Test: Integer variable - should optimize
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_trivial_scalar_hoist
+func.func @test_trivial_scalar_hoist() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_trivial_scalarEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[LOAD:.*]] = fir.load %[[DECL]] : !fir.ref<i32>
+  // CHECK-NOT: acc.firstprivate_map
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: fir.call @use_i32(%[[LOAD]])
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Inside offload region - should optimize by getting rid of firstprivate_map
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_inside_offload_region
+func.func @test_inside_offload_region() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_inside_offloadEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %copyin = acc.copyin varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[DECL]] : !fir.ref<i32>) -> !fir.ref<i32>
+  acc.parallel dataOperands(%copyin : !fir.ref<i32>) {
+    %fpmap = acc.firstprivate_map varPtr(%copyin : !fir.ref<i32>) -> !fir.ref<i32>
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: acc.parallel dataOperands(%[[COPYIN]] : !fir.ref<i32>)
+    // CHECK-NOT: acc.firstprivate_map
+    // CHECK: %[[LOAD:.*]] = fir.load %[[COPYIN]] : !fir.ref<i32>
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Local alloca inside offload region - should optimize (erase)
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_local_alloca_inside_offload
+func.func @test_local_alloca_inside_offload() {
+  acc.parallel {
+    %local = fir.alloca i32 {bindc_name = "local_var"}
+    %decl = fir.declare %local {uniq_name = "_QFtest_local_allocaElocal_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: acc.parallel
+    // CHECK: %[[LOCAL:.*]] = fir.alloca i32
+    // CHECK: %[[DECL:.*]] = fir.declare %[[LOCAL]]
+    // CHECK-NOT: acc.firstprivate_map
+    // CHECK: %[[LOAD:.*]] = fir.load %[[DECL]] : !fir.ref<i32>
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: firstprivate_map with acc.private input - should optimize (erase)
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_private_input
+func.func @test_private_input() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_private_inputEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %private = acc.private varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECL]] : !fir.ref<i32>) -> !fir.ref<i32>
+  acc.parallel private(%private : !fir.ref<i32>) {
+    %fpmap = acc.firstprivate_map varPtr(%private : !fir.ref<i32>) -> !fir.ref<i32>
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: acc.parallel private(%[[PRIVATE]] : !fir.ref<i32>)
+    // CHECK-NOT: acc.firstprivate_map
+    // CHECK: %[[LOAD:.*]] = fir.load %[[PRIVATE]] : !fir.ref<i32>
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Variable defined outside offload region but firstprivate_map inside -
+// should hoist out and then optimize
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_hoist_from_offload_region
+func.func @test_hoist_from_offload_region() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_hoistEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[LOAD:.*]] = fir.load %[[DECL]] : !fir.ref<i32>
+  // CHECK-NOT: acc.firstprivate_map
+  // CHECK: acc.parallel
+  acc.parallel {
+    %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: fir.call @use_i32(%[[LOAD]])
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Optional variable - should NOT optimize
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_optional_no_hoist
+func.func @test_optional_no_hoist(%arg0: !fir.ref<i32>) {
+  %decl = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_optionalEopt_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare {{.*}} {fortran_attrs = #fir.var_attrs<optional>
+  // CHECK: %[[FPMAP:.*]] = acc.firstprivate_map varPtr(%[[DECL]] : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: %[[LOAD:.*]] = fir.load %[[FPMAP]] : !fir.ref<i32>
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Array type (non-trivial) - should NOT optimize
+
+func.func private @use_array(!fir.array<10xi32>)
+
+// CHECK-LABEL: func.func @test_array_no_hoist
+func.func @test_array_no_hoist() {
+  %c10 = arith.constant 10 : index
+  %array = fir.alloca !fir.array<10xi32> {bindc_name = "array_var"}
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+  %decl = fir.declare %array(%shape) {uniq_name = "_QFtest_arrayEarray_var"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+  %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.array<10xi32>>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[FPMAP:.*]] = acc.firstprivate_map varPtr(%[[DECL]] : !fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.array<10xi32>>
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load = fir.load %fpmap : !fir.ref<!fir.array<10xi32>>
+    // CHECK: %[[LOAD:.*]] = fir.load %[[FPMAP]] : !fir.ref<!fir.array<10xi32>>
+    fir.call @use_array(%load) : (!fir.array<10xi32>) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Multiple loads from same firstprivate_map - should optimize and hoist all
+
+func.func private @use_i32_i32(i32, i32)
+
+// CHECK-LABEL: func.func @test_multiple_loads_hoist
+func.func @test_multiple_loads_hoist() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_multiple_loadsEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %fpmap = acc.firstprivate_map varPtr(%decl : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK-DAG: %[[LOAD1:.*]] = fir.load %[[DECL]] : !fir.ref<i32>
+  // CHECK-DAG: %[[LOAD2:.*]] = fir.load %[[DECL]] : !fir.ref<i32>
+  // CHECK-NOT: acc.firstprivate_map
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load1 = fir.load %fpmap : !fir.ref<i32>
+    %load2 = fir.load %fpmap : !fir.ref<i32>
+    fir.call @use_i32_i32(%load1, %load2) : (i32, i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Variable through fir.convert - should optimize
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_through_convert
+func.func @test_through_convert() {
+  %scalar = fir.alloca i32 {bindc_name = "scalar_var"}
+  %decl = fir.declare %scalar {uniq_name = "_QFtest_convertEscalar_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %convert = fir.convert %decl : (!fir.ref<i32>) -> !fir.ref<i32>
+  %fpmap = acc.firstprivate_map varPtr(%convert : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[DECL:.*]] = fir.declare
+  // CHECK: %[[CONVERT:.*]] = fir.convert %[[DECL]]
+  // CHECK: %[[LOAD:.*]] = fir.load %[[CONVERT]] : !fir.ref<i32>
+  // CHECK-NOT: acc.firstprivate_map
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: fir.call @use_i32(%[[LOAD]])
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
+
+// -----
+
+// Test: Block argument (unknown origin) - should NOT optimize
+
+func.func private @use_i32(i32)
+
+// CHECK-LABEL: func.func @test_block_arg_no_hoist
+func.func @test_block_arg_no_hoist(%arg0: !fir.ref<i32>) {
+  // No declare op, so we can't determine if it's optional - conservative no-op
+  %fpmap = acc.firstprivate_map varPtr(%arg0 : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: %[[FPMAP:.*]] = acc.firstprivate_map varPtr(%arg0 : !fir.ref<i32>) -> !fir.ref<i32>
+  // CHECK: acc.parallel
+  acc.parallel {
+    %load = fir.load %fpmap : !fir.ref<i32>
+    // CHECK: %[[LOAD:.*]] = fir.load %[[FPMAP]] : !fir.ref<i32>
+    fir.call @use_i32(%load) : (i32) -> ()
+    acc.yield
+  }
+  return
+}
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
index 95a8f22a3ddfa..2b585d8d0db90 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -135,6 +135,14 @@ def OffloadRegionOpInterface : OpInterface<"OffloadRegionOpInterface"> {
     their regions will be extracted and compiled separately (e.g., as
     device kernels or outlined functions).
   }];
+
+  let methods = [
+    InterfaceMethod<"Get the offload region", "::mlir::Region&",
+      "getOffloadRegion",
+      (ins), [{
+        return $_op.getRegion();
+      }]>,
+  ];
 }
 
 #endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index abfcad88c2437..6ebb2a8127720 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -306,7 +306,11 @@ struct MemrefGlobalVariableModel
 
 struct GPULaunchOffloadRegionModel
     : public acc::OffloadRegionOpInterface::ExternalModel<
-          GPULaunchOffloadRegionModel, gpu::LaunchOp> {};
+          GPULaunchOffloadRegionModel, gpu::LaunchOp> {
+  mlir::Region &getOffloadRegion(mlir::Operation *op) const {
+    return cast<gpu::LaunchOp>(op).getBody();
+  }
+};
 
 /// Helper function for any of the times we need to modify an ArrayAttr based on
 /// a device type list.  Returns a new ArrayAttr with all of the

>From ff10b8460bbde3bc6e9f1f71272d2587fa95786b Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Wed, 28 Jan 2026 16:04:43 -0800
Subject: [PATCH 2/6] fix format

---
 .../Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
index 2c855ecbe99e1..2338816cc074b 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h"
-#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/DirectivesCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -24,6 +23,7 @@
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/Dialect/Support/KindMapping.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
 #include "flang/Optimizer/Support/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
@@ -1212,7 +1212,6 @@ template mlir::Value OpenACCPointerLikeModel<fir::LLVMPointerType>::genAllocate(
     llvm::StringRef varName, mlir::Type varType, mlir::Value originalVar,
     bool &needsFree) const;
 
-
 template <typename Ty>
 bool OpenACCPointerLikeModel<Ty>::genFree(
     mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc,
@@ -1478,7 +1477,8 @@ static bool hasCUDADeviceAttrOnFuncArg(mlir::BlockArgument blockArg) {
 /// Shared implementation for checking if a value represents device data.
 static bool isDeviceDataImpl(mlir::Value var) {
   // Strip casts to find the underlying value.
-  mlir::Value currentVal = fir::acc::getOriginalDef(var, /*stripDeclare=*/false);
+  mlir::Value currentVal =
+      fir::acc::getOriginalDef(var, /*stripDeclare=*/false);
 
   if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(currentVal))
     return hasCUDADeviceAttrOnFuncArg(blockArg);

>From 45ba652685f911c0c9fffa307506ec3b4049a7d7 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Wed, 28 Jan 2026 17:25:34 -0800
Subject: [PATCH 3/6] Add missing library dependency

---
 flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
index 10c6c20dc3e06..27c5ee64aea27 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_flang_library(FIROpenACCTransforms
   FIRDialect
   FIRDialectSupport
   FIROpenACCAnalysis
+  FIROpenACCSupport
   HLFIRDialect
 
   MLIR_LIBS

>From af9e170557de2c80391e099af96fd7baeb09cfeb Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 29 Jan 2026 10:36:03 -0800
Subject: [PATCH 4/6] Improve getEnclosingOffloadRegion

---
 .../OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp     | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
index b0444be73862f..628e3b9802e59 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
@@ -51,13 +51,7 @@ namespace {
 /// Returns the enclosing offload region interface, or nullptr if not inside
 /// one.
 static acc::OffloadRegionOpInterface getEnclosingOffloadRegion(Operation *op) {
-  Operation *parent = op->getParentOp();
-  while (parent) {
-    if (auto offloadOp = dyn_cast<acc::OffloadRegionOpInterface>(parent))
-      return offloadOp;
-    parent = parent->getParentOp();
-  }
-  return nullptr;
+  return op->getParentOfType<acc::OffloadRegionOpInterface>();
 }
 
 /// Returns true if the value is defined by an OpenACC data clause operation.

>From 0627cfac3c6e441ebc0041306d4775bb51c32d4b Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 29 Jan 2026 10:42:42 -0800
Subject: [PATCH 5/6] Remove if there are no uses

---
 .../OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp         | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
index 628e3b9802e59..862eec751d817 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
@@ -163,6 +163,10 @@ class ACCOptimizeFirstprivateMap
     Value var = firstprivateInitOp.getVar();
     Value accVar = firstprivateInitOp.getAccVar();
 
+    // If there are no uses, we can erase the operation.
+    if (accVar.use_empty())
+      return true;
+
     // Only optimize references to trivial types.
     if (!isRefToTrivialType(var.getType()))
       return false;

>From 5c8eb352c76db67ff33e0175f363d3256e9995b9 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 29 Jan 2026 10:43:00 -0800
Subject: [PATCH 6/6] Check all uses are loads before hoisting

---
 .../Transforms/ACCOptimizeFirstprivateMap.cpp | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
index 862eec751d817..ec40e1209f97a 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCOptimizeFirstprivateMap.cpp
@@ -97,18 +97,24 @@ static bool isRefToTrivialType(Type type) {
   return fir::isa_trivial(fir::unwrapRefType(type));
 }
 
-static void hoistLoads(acc::FirstprivateMapInitialOp firstprivateInitOp,
+/// Attempts to hoist loads from accVar to before firstprivateInitOp.
+/// Returns true if all uses of accVar are loads and they were hoisted.
+static bool hoistLoads(acc::FirstprivateMapInitialOp firstprivateInitOp,
                        Value var, Value accVar) {
-  llvm::SmallVector<fir::LoadOp> loadsToHoist;
-  for (Operation *user : accVar.getUsers()) {
-    if (auto loadOp = dyn_cast<fir::LoadOp>(user))
-      loadsToHoist.push_back(loadOp);
-  }
+  // Check if all uses are loads - only hoist if we can optimize all uses.
+  bool allLoads = llvm::all_of(accVar.getUsers(), [](Operation *user) {
+    return isa<fir::LoadOp>(user);
+  });
+  if (!allLoads)
+    return false;
 
-  for (fir::LoadOp loadOp : loadsToHoist) {
+  // Hoist all loads before the firstprivate_map operation.
+  for (Operation *user : llvm::make_early_inc_range(accVar.getUsers())) {
+    auto loadOp = cast<fir::LoadOp>(user);
     loadOp.getMemrefMutable().assign(var);
     loadOp->moveBefore(firstprivateInitOp);
   }
+  return true;
 }
 
 class ACCOptimizeFirstprivateMap
@@ -176,8 +182,7 @@ class ACCOptimizeFirstprivateMap
     if (mayBeOptionalVariable(var))
       return false;
 
-    hoistLoads(firstprivateInitOp, var, accVar);
-    return true;
+    return hoistLoads(firstprivateInitOp, var, accVar);
   }
 };
 



More information about the Mlir-commits mailing list