[flang-commits] [flang] [flang][cuda] Set address space for constant variables (PR #163430)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Tue Oct 14 11:47:03 PDT 2025
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/163430
>From 2fc7512d8f254213888dbe9a4b5d210b48464acf Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 14 Oct 2025 10:59:39 -0700
Subject: [PATCH 1/4] [flang][cuda] Set address space for constant variables
---
flang/lib/Optimizer/CodeGen/CodeGen.cpp | 3 ++-
flang/test/Fir/CUDA/cuda-code-gen.mlir | 19 +++++++++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 0afb295e58e54..3f55c82120ed1 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3231,7 +3231,8 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
if (global.getDataAttr() &&
*global.getDataAttr() == cuf::DataAttribute::Constant)
- TODO(global.getLoc(), "CUDA Fortran CONSTANT variable code generation");
+ g.setAddrSpace(
+ static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Constant));
rewriter.eraseOp(global);
return mlir::success();
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index bbd3f9fbd351b..8168e97a9f6bb 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -284,3 +284,22 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: llvm.func @_QQxxx()
// CHECK: llvm.alloca %{{.*}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
// CHECK-NOT: llvm.call @_FortranACUFAllocDescriptor
+
+// -----
+
+fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+}
+func.func @_QMkernelsPassign(%arg0: !fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %1 = fir.address_of(@_QMkernelsEinitial_val) : !fir.ref<i32>
+ %14 = fir.load %1 : !fir.ref<i32>
+ fir.store %14 to %arg0 : !fir.ref<i32>
+ return
+}
+
+// CHECK: llvm.mlir.global external @_QMkernelsEinitial_val() {addr_space = 4 : i32} : i32
+// CHECK-LABEL: llvm.func @_QMkernelsPassign
+// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
+// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
+
>From 91e74bde3d9c00bc2a014e08a0f4a095b5038197 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 14 Oct 2025 11:40:57 -0700
Subject: [PATCH 2/4] Fix address cast in gpu mod
---
flang/lib/Optimizer/CodeGen/CodeGen.cpp | 12 +++++++++
flang/test/Fir/CUDA/cuda-code-gen.mlir | 33 ++++++++++++++++---------
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 3f55c82120ed1..56a4039117dda 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -176,6 +176,18 @@ struct AddrOfOpConversion : public fir::FIROpConversion<fir::AddrOfOp> {
llvm::LogicalResult
matchAndRewrite(fir::AddrOfOp addr, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
+
+ if (auto gpuMod = addr->getParentOfType<mlir::gpu::GPUModuleOp>()) {
+ auto global = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
+ if (global) {
+ replaceWithAddrOfOrASCast(
+ rewriter, addr->getLoc(), global.getAddrSpace(),
+ getProgramAddressSpace(rewriter), global.getSymName(),
+ convertType(addr.getType()), addr);
+ }
+ return mlir::success();
+ }
+
auto global = addr->getParentOfType<mlir::ModuleOp>()
.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
replaceWithAddrOfOrASCast(
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index 8168e97a9f6bb..b4fa2965070d2 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -287,19 +287,28 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// -----
-fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
- %0 = fir.zero_bits i32
- fir.has_value %0 : i32
-}
-func.func @_QMkernelsPassign(%arg0: !fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
- %1 = fir.address_of(@_QMkernelsEinitial_val) : !fir.ref<i32>
- %14 = fir.load %1 : !fir.ref<i32>
- fir.store %14 to %arg0 : !fir.ref<i32>
- return
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod [#nvvm.target<chip = "sm_90", features = "+ptx75", link = ["/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_builtin_intrinsics_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_cpp_builtins.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12//libdevice_nvhpc_cuda_runtime_builtins_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/cuda/12.9/nvvm/libdevice/libdevice.10.bc"]>] attributes {llvm.data_layout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"} {
+ fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+ }
+ gpu.func @_QMkernelsPassign(%arg0: !fir.ref<!fir.array<?xi32>>) kernel {
+ %c-1 = arith.constant -1 : index
+ %c1_i32 = arith.constant 1 : i32
+ %0 = arith.constant 1 : i32
+ %1 = arith.addi %0, %c1_i32 : i32
+ %2 = fir.address_of(@_QMkernelsEinitial_val) : !fir.ref<i32>
+ %4 = fir.load %2 : !fir.ref<i32>
+ %5 = fir.convert %1 : (i32) -> i64
+ %6 = fircg.ext_array_coor %arg0(%c-1)<%5> : (!fir.ref<!fir.array<?xi32>>, index, i64) -> !fir.ref<i32>
+ fir.store %4 to %6 : !fir.ref<i32>
+ gpu.return
+ }
+ }
}
-// CHECK: llvm.mlir.global external @_QMkernelsEinitial_val() {addr_space = 4 : i32} : i32
-// CHECK-LABEL: llvm.func @_QMkernelsPassign
+// CHECK: llvm.mlir.global external @_QMkernelsEinitial_val() {addr_space = 4 : i32} : i32
+// CHECK-LABEL: gpu.func @_QMkernelsPassign
// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
-
>From a8b0bee55c5fad2adc27f91d653f908edbdca1ae Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 14 Oct 2025 11:42:37 -0700
Subject: [PATCH 3/4] Add assertion
---
flang/lib/Optimizer/CodeGen/CodeGen.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 56a4039117dda..3c3804f35bf30 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -179,12 +179,11 @@ struct AddrOfOpConversion : public fir::FIROpConversion<fir::AddrOfOp> {
if (auto gpuMod = addr->getParentOfType<mlir::gpu::GPUModuleOp>()) {
auto global = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
- if (global) {
- replaceWithAddrOfOrASCast(
- rewriter, addr->getLoc(), global.getAddrSpace(),
- getProgramAddressSpace(rewriter), global.getSymName(),
- convertType(addr.getType()), addr);
- }
+ assert(global && "Expect global in gpu module");
+ replaceWithAddrOfOrASCast(rewriter, addr->getLoc(), global.getAddrSpace(),
+ getProgramAddressSpace(rewriter),
+ global.getSymName(),
+ convertType(addr.getType()), addr);
return mlir::success();
}
>From 781c29348483890d0631c2d181fa016649db0b4b Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 14 Oct 2025 11:46:50 -0700
Subject: [PATCH 4/4] Remove data layout
---
flang/test/Fir/CUDA/cuda-code-gen.mlir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index b4fa2965070d2..60cda9e98c7d8 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -288,7 +288,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// -----
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
- gpu.module @cuda_device_mod [#nvvm.target<chip = "sm_90", features = "+ptx75", link = ["/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_builtin_intrinsics_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_cpp_builtins.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12//libdevice_nvhpc_cuda_runtime_builtins_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/cuda/12.9/nvvm/libdevice/libdevice.10.bc"]>] attributes {llvm.data_layout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"} {
+ gpu.module @cuda_device_mod {
fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
%0 = fir.zero_bits i32
fir.has_value %0 : i32
More information about the flang-commits
mailing list