[flang-commits] [flang] 2fb2d7e - [flang][cuda] Change how to handle static shared memory variables (#170388)
via flang-commits
flang-commits at lists.llvm.org
Wed Dec 3 13:05:33 PST 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-12-03T13:05:28-08:00
New Revision: 2fb2d7eb412f25fbe48f47a31b017a87d2398f8a
URL: https://github.com/llvm/llvm-project/commit/2fb2d7eb412f25fbe48f47a31b017a87d2398f8a
DIFF: https://github.com/llvm/llvm-project/commit/2fb2d7eb412f25fbe48f47a31b017a87d2398f8a.diff
LOG: [flang][cuda] Change how to handle static shared memory variables (#170388)
Generate one global per static shared memory variable so that each variable's
alignment can be set individually. Handling of dynamic shared memory is unchanged.
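A minimal before/after sketch of the effect, using names taken from the
updated tests below ("..." elides the remaining attributes):

  // Before: one combined global per kernel; variables addressed by byte offset.
  cuf.shared_memory[%c0 : i32] i32 {bindc_name = "a", ...} -> !fir.ref<i32>
  cuf.shared_memory[%c4 : i32] i32 {bindc_name = "b", ...} -> !fir.ref<i32>
  fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, ...} : !fir.array<28xi8>

  // After: one global per static variable, each at offset 0 with its own alignment.
  cuf.shared_memory[%c0 : i32] i32 align 4 {bindc_name = "a", isStatic, ...} -> !fir.ref<i32>
  cuf.shared_memory[%c0 : i32] i32 align 4 {bindc_name = "b", isStatic, ...} -> !fir.ref<i32>
  fir.global internal @_QPshared_static__shared_mem__a(dense<0> : vector<4xi8>) {alignment = 4 : i64, ...} : !fir.array<4xi8>
  fir.global internal @_QPshared_static__shared_mem__b(dense<0> : vector<4xi8>) {alignment = 4 : i64, ...} : !fir.array<4xi8>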
Added:
Modified:
flang/include/flang/Optimizer/Builder/CUFCommon.h
flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
flang/test/Fir/CUDA/cuda-code-gen.mlir
flang/test/Fir/CUDA/cuda-shared-offset.mlir
flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index 98d01958846f7..736f90123969c 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -14,7 +14,7 @@
#include "mlir/IR/BuiltinOps.h"
static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
-static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
+static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem__";
namespace fir {
class FirOpBuilder;
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 3fda523acb382..920bef99dc996 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -351,7 +351,8 @@ def cuf_SharedMemoryOp
OptionalAttr<StrAttr>:$bindc_name, Variadic<AnyIntegerType>:$typeparams,
Variadic<AnyIntegerType>:$shape,
// offset in bytes from the shared memory base address.
- Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment);
+ Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment,
+ UnitAttr:$isStatic);
let results = (outs fir_ReferenceType:$ptr);
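With the new UnitAttr, a static shared-memory allocation prints the isStatic
marker next to its alignment, as in the test updates below:

  %0 = cuf.shared_memory[%c0 : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>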
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 671e5f9455c22..97f7f76a8fbe7 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -333,7 +333,8 @@ void cuf::SharedMemoryOp::build(
bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
build(builder, result, wrapAllocaResultType(inType),
mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
- /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{});
+ /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{},
+ /*isStatic=*/nullptr);
result.addAttributes(attributes);
}
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
index a64494510d847..7bae0602fe5ca 100644
--- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -46,6 +46,43 @@ static bool isAssumedSize(mlir::ValueRange shape) {
return false;
}
+static void createSharedMemoryGlobal(fir::FirOpBuilder &builder,
+ mlir::Location loc, llvm::StringRef prefix,
+ llvm::StringRef suffix,
+ mlir::gpu::GPUModuleOp gpuMod,
+ mlir::Type sharedMemType, unsigned size,
+ unsigned align, bool isDynamic) {
+ std::string sharedMemGlobalName =
+ isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str()
+ : (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str();
+
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ builder.setInsertionPointToEnd(gpuMod.getBody());
+
+ mlir::StringAttr linkage = isDynamic ? builder.createExternalLinkage()
+ : builder.createInternalLinkage();
+ llvm::SmallVector<mlir::NamedAttribute> attrs;
+ auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
+ gpuMod.getContext());
+ attrs.push_back(mlir::NamedAttribute(
+ fir::GlobalOp::getDataAttrAttrName(globalOpName),
+ cuf::DataAttributeAttr::get(gpuMod.getContext(),
+ cuf::DataAttribute::Shared)));
+
+ mlir::DenseElementsAttr init = {};
+ mlir::Type i8Ty = builder.getI8Type();
+ if (size > 0) {
+ auto vecTy = mlir::VectorType::get(
+ static_cast<fir::SequenceType::Extent>(size), i8Ty);
+ mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
+ init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
+ }
+ auto sharedMem =
+ fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false,
+ sharedMemType, init, linkage, attrs);
+ sharedMem.setAlignment(align);
+}
+
struct CUFComputeSharedMemoryOffsetsAndSize
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
CUFComputeSharedMemoryOffsetsAndSize> {
@@ -108,18 +145,23 @@ struct CUFComputeSharedMemoryOffsetsAndSize
crtDynOffset, dynSize);
else
crtDynOffset = dynSize;
-
- continue;
+ } else {
+ // Static shared memory.
+ auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
+ loc, sharedOp.getInType(), *dl, kindMap);
+ createSharedMemoryGlobal(
+ builder, sharedOp.getLoc(), funcOp.getName(),
+ *sharedOp.getBindcName(), gpuMod,
+ fir::SequenceType::get(size, i8Ty), size,
+ sharedOp.getAlignment() ? *sharedOp.getAlignment() : align,
+ /*isDynamic=*/false);
+ mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
+ sharedOp.getOffsetMutable().assign(zero);
+ if (!sharedOp.getAlignment())
+ sharedOp.setAlignment(align);
+ sharedOp.setIsStatic(true);
+ ++nbStaticSharedVariables;
}
- auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
- sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
- ++nbStaticSharedVariables;
- mlir::Value offset = builder.createIntegerConstant(
- loc, i32Ty, llvm::alignTo(sharedMemSize, align));
- sharedOp.getOffsetMutable().assign(offset);
- sharedMemSize =
- llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
- alignment = std::max(alignment, align);
}
if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
@@ -130,35 +172,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize
funcOp.getLoc(),
"static and dynamic shared variables in a single kernel");
- mlir::DenseElementsAttr init = {};
- if (sharedMemSize > 0) {
- auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
- mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
- init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
- }
+ if (nbStaticSharedVariables > 0)
+ continue;
- // Create the shared memory global where each shared variable will point
- // to.
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
- std::string sharedMemGlobalName =
- (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
- // Dynamic shared memory needs an external linkage while static shared
- // memory needs an internal linkage.
- mlir::StringAttr linkage = nbDynamicSharedVariables > 0
- ? builder.createExternalLinkage()
- : builder.createInternalLinkage();
- builder.setInsertionPointToEnd(gpuMod.getBody());
- llvm::SmallVector<mlir::NamedAttribute> attrs;
- auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
- gpuMod.getContext());
- attrs.push_back(mlir::NamedAttribute(
- fir::GlobalOp::getDataAttrAttrName(globalOpName),
- cuf::DataAttributeAttr::get(gpuMod.getContext(),
- cuf::DataAttribute::Shared)));
- auto sharedMem = fir::GlobalOp::create(
- builder, funcOp.getLoc(), sharedMemGlobalName, false, false,
- sharedMemType, init, linkage, attrs);
- sharedMem.setAlignment(alignment);
+ createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "",
+ gpuMod, sharedMemType, sharedMemSize, alignment,
+ /*isDynamic=*/true);
}
}
};
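The naming scheme implemented by createSharedMemoryGlobal, illustrated with
globals from the tests below: a dynamic allocation gets one external global
per kernel named funcName + "__shared_mem__", while each static variable gets
its own internal global further suffixed with its bindc name:

  fir.global external @_QPdynshared__shared_mem__ {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
  fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<8xi8>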
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 40f180a8c1657..d5a8212eb5472 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -249,8 +249,13 @@ struct CUFSharedMemoryOpConversion
"cuf.shared_memory must have an offset for code gen");
auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();
+
std::string sharedGlobalName =
- (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
+ op.getIsStatic()
+ ? (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix) +
+ *op.getBindcName())
+ .str()
+ : (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
mlir::Value sharedGlobalAddr =
createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);
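The lowered access sequence itself is unchanged; only the symbol name differs,
with isStatic ops now resolving to their per-variable global. As in the
cuda-shared-to-llvm.mlir checks below, each op lowers to an addressof of the
shared-memory global, an addrspacecast, and a byte GEP at the op's offset:

  %0 = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
  %1 = llvm.addrspacecast %0 : !llvm.ptr<3> to !llvm.ptr
  %2 = llvm.getelementptr %1[%c4] : (!llvm.ptr, i32) -> !llvm.ptr, i8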
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index 60cda9e98c7d8..e83648f21bdf1 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -201,9 +201,9 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xi32>> {cuf.data_attr = #cuf.c
// -----
-fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
+fir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
-// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
+// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
// -----
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
index 37b36b2bd050e..1a39fefe85cda 100644
--- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -17,7 +17,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
-// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+// CHECK: fir.global external @_QPdynshared__shared_mem__ {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
// -----
@@ -43,15 +43,20 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPshared_static()
-// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
-// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "b", isStatic, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "c", isStatic, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "d", isStatic, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i64 align 8 {bindc_name = "e", isStatic, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] f32 align 4 {bindc_name = "r", isStatic, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: gpu.return
// CHECK: }
-// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__a(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__b(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__c(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__d(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<8xi8>
+// CHECK: fir.global internal @_QPshared_static__shared_mem__r(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: }
// CHECK: }
@@ -159,4 +164,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
-// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+// CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
diff --git a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir
index 26479d1cdd94f..69370613cd348 100644
--- a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir
@@ -9,14 +9,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
%1 = cuf.shared_memory [%c4 : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
llvm.return
}
- llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
+ llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
}
}
// CHECK-LABEL: llvm.func @_QPshared_static()
-// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
+// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
// CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast %[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr
// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8
-// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
+// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
// CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr
// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8