[flang-commits] [flang] ab2a302 - [flang][cuda] Add support for cluster_dim_blocks in cooperative_groups (#169417)

Mon Nov 24 14:55:07 PST 2025

Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-11-24T22:55:02Z
New Revision: ab2a302f0ee8b31404aa4cc454caee40f46602bd

URL: https://github.com/llvm/llvm-project/commit/ab2a302f0ee8b31404aa4cc454caee40f46602bd
DIFF: https://github.com/llvm/llvm-project/commit/ab2a302f0ee8b31404aa4cc454caee40f46602bd.diff

LOG: [flang][cuda] Add support for cluster_dim_blocks in cooperative_groups (#169417)

Added: 
    flang/test/Lower/CUDA/cuda-cluster.cuf

Modified: 
    flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
    flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
    flang/module/cooperative_groups.f90

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index 027bd3b79a1df..cedc7a9437eb5 100644

--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -47,6 +47,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   template <const char *fctName, int extent>
   fir::ExtendedValue genLDXXFunc(mlir::Type,

diff  --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index c560c53033780..a770e2d9cdeff 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -368,6 +368,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genNVVMTime<mlir::NVVM::Clock64Op>),
      {},
      /*isElemental=*/false},
+    {"cluster_dim_blocks",
+     static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+         &CI::genClusterDimBlocks),
+     {},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -985,6 +990,38 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
       .getResult(0);
 }
 
+// CLUSTER_DIM_BLOCKS
+mlir::Value
+CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
+                                          llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 0);
+  auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expepected");
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+
+  auto insertDim = [&](mlir::Value dim, unsigned fieldPos) {
+    auto fieldName = recTy.getTypeList()[fieldPos].first;
+    mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
+    mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+    mlir::Value fieldIndex = fir::FieldIndexOp::create(
+        builder, loc, fieldIndexType, fieldName, recTy,
+        /*typeParams=*/mlir::ValueRange{});
+    mlir::Value coord = fir::CoordinateOp::create(
+        builder, loc, builder.getRefType(fieldTy), res, fieldIndex);
+    fir::StoreOp::create(builder, loc, dim, coord);
+  };
+
+  mlir::Type i32Ty = builder.getI32Type();
+  mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
+  insertDim(x, 0);
+  mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
+  insertDim(y, 1);
+  mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
+  insertDim(z, 2);
+
+  return res;
+}
+
 // FENCE_PROXY_ASYNC
 void CUDAIntrinsicLibrary::genFenceProxyAsync(
     llvm::ArrayRef<fir::ExtendedValue> args) {

diff  --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90
index 1c89866f9c84a..2631975837a5b 100644
--- a/flang/module/cooperative_groups.f90
+++ b/flang/module/cooperative_groups.f90
@@ -38,6 +38,13 @@ module cooperative_groups
   integer(4) :: rank
 end type thread_group
 
+interface
+  attributes(device) function cluster_dim_blocks()
+    import
+    type(dim3) :: cluster_dim_blocks
+  end function
+end interface
+
 interface
   attributes(device) function this_cluster()
     import

diff  --git a/flang/test/Lower/CUDA/cuda-cluster.cuf b/flang/test/Lower/CUDA/cuda-cluster.cuf
new file mode 100644
index 0000000000000..51cc4208a35de
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-cluster.cuf
@@ -0,0 +1,34 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine test_this_cluster()
+  use cooperative_groups
+  type(cluster_group) :: cluster
+
+  cluster = this_cluster()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_this_cluster() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %{{.*}} = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group
+! CHECK: %[[RES:.*]] = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>
+! CHECK: %[[RANK:.*]] = nvvm.read.ptx.sreg.cluster.ctarank : i32
+! CHECK: %[[RANK_1:.*]] = arith.addi %[[RANK]], %c1{{.*}} : i32 
+! CHECK: %[[RANK_COORD:.*]] = fir.coordinate_of %[[RES]], rank : (!fir.ref<!fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[RANK_1]] to %[[RANK_COORD]] : !fir.ref<i32>
+  
+attributes(global) subroutine test_cluster_dim_blocks()
+  use cooperative_groups
+  type(dim3) :: clusterDim
+
+  clusterDim = cluster_dim_blocks()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cluster_dim_blocks() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
+! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[X]] to %[[COORD_X]] : !fir.ref<i32>
+! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
+! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Y]] to %[[COORD_Y]] : !fir.ref<i32>
+! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
+! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Z]] to %[[COORD_Z]] : !fir.ref<i32>