[flang-commits] [flang] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect (PR #123840)

Tue Jan 21 14:58:54 PST 2025

https://github.com/Renaud-K created https://github.com/llvm/llvm-project/pull/123840

With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the `llvm.atomicrw` operation. 

I am only submitting changes for `atomicadd` to gather feedback. If we are to proceed with these changes I will add support for all other applicable atomic operations following this pattern.


>From 44a05d05146a71d4c7d364695d0db788bd3d7d97 Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Tue, 21 Jan 2025 14:38:45 -0800
Subject: [PATCH 1/2] Adding atomicadd as a cudadevice intrinsic and converting
 it LLVM dialect

---
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  2 ++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 26 +++++++++++++++++++
 flang/module/cudadevice.f90                   | 26 +++++++++++++++++++
 flang/test/Lower/CUDA/cuda-device-proc.cuf    | 13 ++++++++++
 .../Semantics/cuf-device-procedures01.cuf     |  9 +++++++
 5 files changed, 76 insertions(+)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 9c9c0609f4fc3c..cba7378cfbbfec 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -20,6 +20,7 @@
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include <mlir/IR/Value.h>
 #include <optional>
 
 namespace fir {
@@ -185,6 +186,7 @@ struct IntrinsicLibrary {
   mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue
       genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6a343645ab8786..fd88a44ac3bcae 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -51,6 +51,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
+#include <mlir/IR/BuiltinAttributes.h>
 #include <mlir/IR/Value.h>
 #include <optional>
 
@@ -147,6 +149,10 @@ static constexpr IntrinsicHandler handlers[]{
     {"atan2pi", &I::genAtanpi},
     {"atand", &I::genAtand},
     {"atanpi", &I::genAtanpi},
+    {"atomicaddd", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddf", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddi", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddl", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
     {"bessel_jn",
      &I::genBesselJn,
      {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -2574,6 +2580,26 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
   return builder.create<mlir::arith::MulFOp>(loc, atan, factor);
 }
 
+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
+                                mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+                                mlir::Value arg1) {
+  auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  arg0 = builder.createConvert(loc, llvmPointerType, arg0);
+  return builder.create<mlir::LLVM::AtomicRMWOp>(
+      loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst);
+}
+
+mlir::Value IntrinsicLibrary::genAtomAdd(mlir::Type resultType,
+                                         llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::add
+          : mlir::LLVM::AtomicBinOp::fadd;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
 // ASSOCIATED
 fir::ExtendedValue
 IntrinsicLibrary::genAssociated(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 3d487fd000a094..2dc0ed1f49bf67 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system()
     end function
   end interface
   public :: __fadd_ru
+
+  ! Atomic Operations
   
+  interface atomicadd
+    attributes(device) pure integer function atomicaddi(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+    attributes(device) pure real function atomicaddf(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    real, intent(inout) :: address
+    real, value :: val
+    end function
+    attributes(device) pure real*8 function atomicaddd(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    real*8, intent(inout) :: address
+    real*8, value :: val
+    end function
+    attributes(device) pure integer(8) function atomicaddl(address, val)
+  !dir$ ignore_tkr (rd) address, (dk) val
+    integer(8), intent(inout) :: address
+    integer(8), value :: val
+    end function
+  end interface 
+public :: atomicadd
+
 end module
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 2042bbbe19650a..661e5728bf85b8 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -5,6 +5,10 @@
 attributes(global) subroutine devsub()
   implicit none
   integer :: ret
+  real(4) :: af
+  real(8) :: ad
+  integer(4) :: ai
+  integer(8) :: al
 
   call syncthreads()
   call syncwarp(1)
@@ -14,6 +18,11 @@ attributes(global) subroutine devsub()
   ret = syncthreads_and(1)
   ret = syncthreads_count(1)
   ret = syncthreads_or(1)
+
+  ai = atomicadd(ai, 1_4)
+  al = atomicadd(al, 1_8)
+  af = atomicadd(af, 1.0_4)
+  ad = atomicadd(ad, 1.0_8)
 end
 
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -25,6 +34,10 @@ end
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
 
 ! CHECK: func.func private @llvm.nvvm.barrier0()
 ! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs<bind_c>}
diff --git a/flang/test/Semantics/cuf-device-procedures01.cuf b/flang/test/Semantics/cuf-device-procedures01.cuf
index b9918d8a4ae4ce..92ee02bb3c64df 100644
--- a/flang/test/Semantics/cuf-device-procedures01.cuf
+++ b/flang/test/Semantics/cuf-device-procedures01.cuf
@@ -28,8 +28,17 @@ end
 ! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice
 
 subroutine host()
+  real(4) :: af
+  real(8) :: ad
+  integer(4) :: ai
+  integer(8) :: al
   call syncthreads()
+  ai = atomicadd(ai, 1_4)
+  al = atomicadd(al, 1_8)
+  af = atomicadd(af, 1.0_4)
+  ad = atomicadd(ad, 1.0_8)
 end subroutine
 
 ! CHECK-LABEL: Subprogram scope: host
+! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}}
 ! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}}

>From 18120ce5b2eefa1b5380ad46ff27b13dd17a56b8 Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Tue, 21 Jan 2025 14:46:58 -0800
Subject: [PATCH 2/2] Corrected includes added by clangd

---
 flang/include/flang/Optimizer/Builder/IntrinsicCall.h | 1 -
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp         | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index cba7378cfbbfec..d5f794a382334f 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -20,7 +20,6 @@
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
-#include <mlir/IR/Value.h>
 #include <optional>
 
 namespace fir {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index fd88a44ac3bcae..0dccd168714e1a 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -44,6 +44,7 @@
 #include "flang/Runtime/iostat-consts.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "llvm/Support/CommandLine.h"
@@ -51,9 +52,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
-#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
-#include <mlir/IR/BuiltinAttributes.h>
-#include <mlir/IR/Value.h>
 #include <optional>
 
 #define DEBUG_TYPE "flang-lower-intrinsic"