[flang-commits] [flang] ab2a302 - [flang][cuda] Add support for cluster_dim_blocks in cooperative_groups (#169417)
via flang-commits
flang-commits at lists.llvm.org
Mon Nov 24 14:55:07 PST 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-11-24T22:55:02Z
New Revision: ab2a302f0ee8b31404aa4cc454caee40f46602bd
URL: https://github.com/llvm/llvm-project/commit/ab2a302f0ee8b31404aa4cc454caee40f46602bd
DIFF: https://github.com/llvm/llvm-project/commit/ab2a302f0ee8b31404aa4cc454caee40f46602bd.diff
LOG: [flang][cuda] Add support for cluster_dim_blocks in cooperative_groups (#169417)
Added:
flang/test/Lower/CUDA/cuda-cluster.cuf
Modified:
flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
flang/module/cooperative_groups.f90
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index 027bd3b79a1df..cedc7a9437eb5 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -47,6 +47,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
template <const char *fctName, int extent>
fir::ExtendedValue genLDXXFunc(mlir::Type,
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index c560c53033780..a770e2d9cdeff 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -368,6 +368,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{
&CI::genNVVMTime<mlir::NVVM::Clock64Op>),
{},
/*isElemental=*/false},
+ {"cluster_dim_blocks",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genClusterDimBlocks),
+ {},
+ /*isElemental=*/false},
{"fence_proxy_async",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genFenceProxyAsync),
@@ -985,6 +990,38 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
.getResult(0);
}
+// CLUSTER_DIM_BLOCKS
+mlir::Value
+CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+ assert(recTy && "RecordType expepected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+
+ auto insertDim = [&](mlir::Value dim, unsigned fieldPos) {
+ auto fieldName = recTy.getTypeList()[fieldPos].first;
+ mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+ mlir::Value fieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, fieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value coord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(fieldTy), res, fieldIndex);
+ fir::StoreOp::create(builder, loc, dim, coord);
+ };
+
+ mlir::Type i32Ty = builder.getI32Type();
+ mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
+ insertDim(x, 0);
+ mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
+ insertDim(y, 1);
+ mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
+ insertDim(z, 2);
+
+ return res;
+}
+
// FENCE_PROXY_ASYNC
void CUDAIntrinsicLibrary::genFenceProxyAsync(
llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90
index 1c89866f9c84a..2631975837a5b 100644
--- a/flang/module/cooperative_groups.f90
+++ b/flang/module/cooperative_groups.f90
@@ -38,6 +38,13 @@ module cooperative_groups
integer(4) :: rank
end type thread_group
+interface
+ attributes(device) function cluster_dim_blocks()
+ import
+ type(dim3) :: cluster_dim_blocks
+ end function
+end interface
+
interface
attributes(device) function this_cluster()
import
diff --git a/flang/test/Lower/CUDA/cuda-cluster.cuf b/flang/test/Lower/CUDA/cuda-cluster.cuf
new file mode 100644
index 0000000000000..51cc4208a35de
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-cluster.cuf
@@ -0,0 +1,34 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine test_this_cluster()
+ use cooperative_groups
+ type(cluster_group) :: cluster
+
+ cluster = this_cluster()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_this_cluster() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %{{.*}} = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group
+! CHECK: %[[RES:.*]] = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>
+! CHECK: %[[RANK:.*]] = nvvm.read.ptx.sreg.cluster.ctarank : i32
+! CHECK: %[[RANK_1:.*]] = arith.addi %[[RANK]], %c1{{.*}} : i32
+! CHECK: %[[RANK_COORD:.*]] = fir.coordinate_of %[[RES]], rank : (!fir.ref<!fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[RANK_1]] to %[[RANK_COORD]] : !fir.ref<i32>
+
+attributes(global) subroutine test_cluster_dim_blocks()
+ use cooperative_groups
+ type(dim3) :: clusterDim
+
+ clusterDim = cluster_dim_blocks()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cluster_dim_blocks() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
+! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[X]] to %[[COORD_X]] : !fir.ref<i32>
+! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
+! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Y]] to %[[COORD_Y]] : !fir.ref<i32>
+! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
+! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Z]] to %[[COORD_Z]] : !fir.ref<i32>
More information about the flang-commits
mailing list