[flang-commits] [flang] 1c95c7a - [flang][cuda] Add interfaces and lowering for barrier_arrive (#162949)

via flang-commits flang-commits at lists.llvm.org
Sat Oct 11 09:54:43 PDT 2025


Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-10-11T09:54:38-07:00
New Revision: 1c95c7ae201c8f89e2ecbeb630813c9fcd23471f

URL: https://github.com/llvm/llvm-project/commit/1c95c7ae201c8f89e2ecbeb630813c9fcd23471f
DIFF: https://github.com/llvm/llvm-project/commit/1c95c7ae201c8f89e2ecbeb630813c9fcd23471f.diff

LOG: [flang][cuda] Add interfaces and lowering for barrier_arrive (#162949)

Added: 
    

Modified: 
    flang/include/flang/Optimizer/Builder/IntrinsicCall.h
    flang/lib/Optimizer/Builder/IntrinsicCall.cpp
    flang/module/cudadevice.f90
    flang/test/Lower/CUDA/cuda-device-proc.cuf

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 695221cbcb42c..ca02693c53aeb 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -208,6 +208,8 @@ struct IntrinsicLibrary {
   fir::ExtendedValue genAssociated(mlir::Type,
                                    llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genBesselJn(mlir::Type,
                                  llvm::ArrayRef<fir::ExtendedValue>);

diff  --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 2c21868295528..c9cf6c23a81a5 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -346,6 +346,14 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>,
      {{{"mask", asValue}, {"pred", asValue}}},
      /*isElemental=*/false},
+    {"barrier_arrive",
+     &I::genBarrierArrive,
+     {{{"barrier", asAddr}}},
+     /*isElemental=*/false},
+    {"barrier_arrive_cnt",
+     &I::genBarrierArriveCnt,
+     {{{"barrier", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
     {"barrier_init",
      &I::genBarrierInit,
      {{{"barrier", asAddr}, {"count", asValue}}},
@@ -3180,19 +3188,53 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
   return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox);
 }
 
-// BARRIER_INIT (CUDA)
-void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 2);
-  auto llvmPtr = fir::ConvertOp::create(
+static mlir::Value convertBarrierToLLVM(fir::FirOpBuilder &builder,
+                                        mlir::Location loc,
+                                        mlir::Value barrier) {
+  mlir::Value llvmPtr = fir::ConvertOp::create(
       builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()),
-      fir::getBase(args[0]));
-  auto addrCast = mlir::LLVM::AddrSpaceCastOp::create(
+      barrier);
+  mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create(
       builder, loc,
       mlir::LLVM::LLVMPointerType::get(
           builder.getContext(),
           static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Shared)),
       llvmPtr);
-  mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, addrCast,
+  return addrCast;
+}
+
+// BARRIER_ARRIVE (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
+                                   llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 1);
+  mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]);
+  return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType,
+                                                    barrier)
+      .getResult();
+}
+
+// BARRIER_ARRIBVE_CNT (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
+                                      llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]);
+  mlir::Value token = fir::AllocaOp::create(builder, loc, resultType);
+  // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and
+  // currently just the sink symbol `_`.
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+  mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1],
+                                               {});
+  return fir::LoadOp::create(builder, loc, token);
+}
+
+// BARRIER_INIT (CUDA)
+void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  mlir::Value barrier =
+      convertBarrierToLLVM(builder, loc, fir::getBase(args[0]));
+  mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier,
                                            fir::getBase(args[1]), {});
 }
 

diff  --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 4f552dcf08372..e6c9e958af365 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1987,13 +1987,27 @@ attributes(device,host) logical function on_device() bind(c)
     end function
   end interface
 
+  ! TMA Operations
+
   interface 
     attributes(device) subroutine barrier_init(barrier, count)
-      integer(8) :: barrier
+      integer(8), shared :: barrier
       integer(4) :: count
     end subroutine
   end interface
 
+  interface barrier_arrive
+    attributes(device) function barrier_arrive(barrier) result(token)
+      integer(8), shared :: barrier
+      integer(8) :: token
+    end function
+    attributes(device) function barrier_arrive_cnt(barrier, count) result(token)
+      integer(8), shared :: barrier
+      integer(4) :: count
+      integer(8) :: token
+    end function
+  end interface
+
 contains
 
   attributes(device) subroutine syncthreads()

diff  --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index cdb337b115e47..1bf714010f5d3 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -394,9 +394,14 @@ end subroutine
 
 attributes(global) subroutine test_barrier()
   integer(8), shared :: barrier
+  integer(8) :: token
+  integer :: count
   call barrier_init(barrier, 256)
-end subroutine
 
+  token = barrier_arrive(barrier)
+
+  token = barrier_arrive(barrier, count)
+end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_barrier()
     
@@ -406,3 +411,11 @@ end subroutine
 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
 ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
 ! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32
+
+! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
+! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64
+
+! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
+! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32


        


More information about the flang-commits mailing list