[flang-commits] [flang] [flang][cuda] Compute offset on cuf.shared_memory ops (PR #131395)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Fri Mar 14 15:54:37 PDT 2025
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/131395
From f4088db7ee469f269eea9c80f7b3015f88e5a751 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 14 Mar 2025 14:19:08 -0700
Subject: [PATCH 1/2] [flang][cuda] Compute offset on cuf.shared_memory ops
---
.../flang/Optimizer/Builder/CUFCommon.h | 1 +
.../flang/Optimizer/Transforms/Passes.h | 1 +
.../flang/Optimizer/Transforms/Passes.td | 13 ++
flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 +
.../CUFComputeSharedMemoryOffsetsAndSize.cpp | 126 ++++++++++++++++++
flang/test/Fir/CUDA/cuda-shared-offset.mlir | 56 ++++++++
6 files changed, 198 insertions(+)
create mode 100644 flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
create mode 100644 flang/test/Fir/CUDA/cuda-shared-offset.mlir
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index e3c7b5098b83f..65b9cce1d2021 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -14,6 +14,7 @@
#include "mlir/IR/BuiltinOps.h"
static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
+static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
namespace fir {
class FirOpBuilder;
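Each kernel gets one shared memory global, named by appending the suffix above
to the kernel's symbol, e.g. _QPshared_static__shared_mem in the new tests. A
standalone sketch of the concatenation (the helper name is illustrative, not
part of the patch):

#include "flang/Optimizer/Builder/CUFCommon.h" // for cudaSharedMemSuffix
#include "llvm/ADT/Twine.h"
#include <string>

std::string sharedMemGlobalName(llvm::StringRef funcName) {
  // "_QPshared_static" + "__shared_mem" -> "_QPshared_static__shared_mem"
  return (funcName + llvm::Twine(cudaSharedMemSuffix)).str();
}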
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 406fedf220d26..6dbabd523f88a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -43,6 +43,7 @@ namespace fir {
#define GEN_PASS_DECL_CUFDEVICEGLOBAL
#define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
#define GEN_PASS_DECL_CUFOPCONVERSION
+#define GEN_PASS_DECL_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#define GEN_PASS_DECL_EXTERNALNAMECONVERSION
#define GEN_PASS_DECL_MEMREFDATAFLOWOPT
#define GEN_PASS_DECL_SIMPLIFYINTRINSICS
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index e5c17cf7d8881..fbab435887b8a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -453,6 +453,19 @@ def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
];
}
+def CUFComputeSharedMemoryOffsetsAndSize
+ : Pass<"cuf-compute-shared-memory", "mlir::ModuleOp"> {
+ let summary = "Create the shared memory global variable and set offsets";
+
+ let description = [{
+ Compute the size and alignment of the shared memory global and materialize
+ it. Compute the byte offset of each cuf.shared_memory operation within
+ that global and set it on the operation.
+ }];
+
+ let dependentDialects = ["fir::FIROpsDialect"];
+}
+
def SetRuntimeCallAttributes
: Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> {
let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR";
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 6e8666759ab83..ca08e4607e019 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_flang_library(FIRTransforms
CUFDeviceGlobal.cpp
CUFOpConversion.cpp
CUFGPUToLLVMConversion.cpp
+ CUFComputeSharedMemoryOffsetsAndSize.cpp
ArrayValueCopy.cpp
ExternalNameConversion.cpp
MemoryUtils.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
new file mode 100644
index 0000000000000..1881ae72ee721
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -0,0 +1,126 @@
+//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/CodeGen/Target.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/registration.h"
+#include "flang/Runtime/entry-names.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace Fortran::runtime::cuda;
+
+namespace {
+
+struct CUFComputeSharedMemoryOffsetsAndSize
+ : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
+ CUFComputeSharedMemoryOffsetsAndSize> {
+
+ void runOnOperation() override {
+ mlir::ModuleOp mod = getOperation();
+ mlir::SymbolTable symTab(mod);
+ mlir::OpBuilder opBuilder{mod.getBodyRegion()};
+ fir::FirOpBuilder builder(opBuilder, mod);
+ fir::KindMapping kindMap{fir::getKindMapping(mod)};
+ std::optional<mlir::DataLayout> dl =
+ fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
+ if (!dl) {
+ mlir::emitError(mod.getLoc(),
+ "data layout attribute is required to perform " +
+ getName() + "pass");
+ }
+
+ auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
+ mlir::Type i8Ty = builder.getI8Type();
+ for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
+ unsigned nbDynamicSharedVariables = 0;
+ unsigned nbStaticSharedVariables = 0;
+ uint64_t sharedMemSize = 0;
+ unsigned short alignment = 0;
+
+ // Go over each shared memory operation and compute its start offset, as
+ // well as the size and alignment of the global to be generated if all
+ // variables are static. For dynamic shared memory, only the alignment
+ // is computed.
+ for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
+ if (fir::hasDynamicSize(sharedOp.getInType())) {
+ mlir::Type ty = sharedOp.getInType();
+ // getTypeSizeAndAlignmentOrCrash will crash trying to compute the
+ // size of an array with dynamic size. Just get the alignment to
+ // create the global.
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
+ ty = seqTy.getEleTy();
+ unsigned short align = dl->getTypeABIAlignment(ty);
+ ++nbDynamicSharedVariables;
+ sharedOp.setOffset(0);
+ alignment = std::max(alignment, align);
+ continue;
+ }
+ auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
+ sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
+ ++nbStaticSharedVariables;
+ sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
+ sharedMemSize =
+ llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
+ alignment = std::max(alignment, align);
+ }
+ if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
+ mlir::emitError(
+ funcOp.getLoc(),
+ "static and dynamic shared variables in a single kernel");
+
+ mlir::DenseElementsAttr init = {};
+ if (sharedMemSize > 0) {
+ auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
+ mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
+ init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
+ }
+
+ // Create the shared memory global where each shared variables will point
+ // to.
+ auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
+ std::string sharedMemGlobalName =
+ (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
+ mlir::StringAttr linkage = builder.createInternalLinkage();
+ builder.setInsertionPointToEnd(gpuMod.getBody());
+ llvm::SmallVector<mlir::NamedAttribute> attrs;
+ auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
+ gpuMod.getContext());
+ attrs.push_back(mlir::NamedAttribute(
+ fir::GlobalOp::getDataAttrAttrName(globalOpName),
+ cuf::DataAttributeAttr::get(gpuMod.getContext(),
+ cuf::DataAttribute::Shared)));
+ auto sharedMem = builder.create<fir::GlobalOp>(
+ funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
+ init, linkage, attrs);
+ sharedMem.setAlignment(alignment);
+ }
+ }
+};
+
+} // end anonymous namespace
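For reference, llvm::alignTo(Value, Align) rounds Value up to the next multiple
of Align, leaving an already-aligned cursor unchanged. A few standalone spot
checks mirroring the offsets the pass computes in the tests below (not part of
the patch):

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(0, 4) == 0);   // the first i32 lands at offset 0
  assert(llvm::alignTo(16, 8) == 16); // an i64 after four i32s stays at 16
  assert(llvm::alignTo(17, 8) == 24); // a misaligned cursor rounds up
  return 0;
}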
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
new file mode 100644
index 0000000000000..b3ea7dfc89cc7
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -0,0 +1,56 @@
+// RUN: fir-opt --split-input-file --cuf-compute-shared-memory %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QPdynshared() kernel {
+ %c-1 = arith.constant -1 : index
+ %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
+ %7 = fir.shape %c-1 : (index) -> !fir.shape<1>
+ %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+ gpu.return
+ }
+ }
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: gpu.func @_QPdynshared()
+// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
+// CHECK: gpu.return
+// CHECK: }
+// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QPshared_static() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %0 = cuf.shared_memory i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEa"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %2 = cuf.shared_memory i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
+ %3 = fir.declare %2 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEb"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %8 = cuf.shared_memory i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
+ %9 = fir.declare %8 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEc"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %10 = cuf.shared_memory i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
+ %11 = fir.declare %10 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEd"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %12 = cuf.shared_memory i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
+ %13 = fir.declare %12 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEe"} : (!fir.ref<i64>) -> !fir.ref<i64>
+ %16 = cuf.shared_memory f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
+ %17 = fir.declare %16 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEr"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ gpu.return
+ }
+ }
+}
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: gpu.func @_QPshared_static()
+// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
+// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
+// CHECK: gpu.return
+// CHECK: }
+// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
+// CHECK: }
+// CHECK: }
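The offsets in the CHECK lines above follow directly from that aligned-cursor
rule; a standalone replay of the static test case (illustrative only):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  uint64_t size = 0;
  // a, b, c, d: i32 (size 4, align 4) -> offsets 0, 4, 8, 12.
  for (int i = 0; i < 4; ++i)
    size = llvm::alignTo(size, 4) + 4;
  // e: i64 (size 8, align 8) -> offset llvm::alignTo(16, 8) == 16.
  size = llvm::alignTo(size, 8) + 8;
  // r: f32 (size 4, align 4) -> offset llvm::alignTo(24, 4) == 24.
  size = llvm::alignTo(size, 4) + 4;
  assert(size == 28); // matches dense<0> : vector<28xi8>, with global
                      // alignment max(4, 8) == 8
  return 0;
}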
From d98655a6e36d2ee1aef4bd5a7e4b9c5c1cdf019d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 14 Mar 2025 15:54:20 -0700
Subject: [PATCH 2/2] Fix typo
---
.../Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
index 1881ae72ee721..5c6d1233c3ed3 100644
--- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -101,7 +101,7 @@ struct CUFComputeSharedMemoryOffsetsAndSize
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
}
- // Create the shared memory global where each shared variables will point
+ // Create the shared memory global where each shared variable will point
// to.
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
std::string sharedMemGlobalName =