[flang-commits] [flang] Adding atomicadd as a cudadevice intrinsic and converting it to the LLVM dialect (PR #123840)
Renaud Kauffmann via flang-commits
flang-commits at lists.llvm.org
Tue Jan 21 14:58:54 PST 2025
https://github.com/Renaud-K created https://github.com/llvm/llvm-project/pull/123840
With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the `llvm.atomicrmw` operation.
I am only submitting changes for `atomicadd` to gather feedback. If we are to proceed with these changes, I will add support for all other applicable atomic operations following this pattern.
>From 44a05d05146a71d4c7d364695d0db788bd3d7d97 Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Tue, 21 Jan 2025 14:38:45 -0800
Subject: [PATCH 1/2] Adding atomicadd as a cudadevice intrinsic and converting
it LLVM dialect
---
.../flang/Optimizer/Builder/IntrinsicCall.h | 2 ++
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 26 +++++++++++++++++++
flang/module/cudadevice.f90 | 26 +++++++++++++++++++
flang/test/Lower/CUDA/cuda-device-proc.cuf | 13 ++++++++++
.../Semantics/cuf-device-procedures01.cuf | 9 +++++++
5 files changed, 76 insertions(+)
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 9c9c0609f4fc3c..cba7378cfbbfec 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -20,6 +20,7 @@
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
+#include <mlir/IR/Value.h>
#include <optional>
namespace fir {
@@ -185,6 +186,7 @@ struct IntrinsicLibrary {
mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genAtomAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue
genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6a343645ab8786..fd88a44ac3bcae 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -51,6 +51,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
+#include <mlir/IR/BuiltinAttributes.h>
#include <mlir/IR/Value.h>
#include <optional>
@@ -147,6 +149,10 @@ static constexpr IntrinsicHandler handlers[]{
{"atan2pi", &I::genAtanpi},
{"atand", &I::genAtand},
{"atanpi", &I::genAtanpi},
+ {"atomicaddd", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+ {"atomicaddf", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+ {"atomicaddi", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+ {"atomicaddl", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
{"bessel_jn",
&I::genBesselJn,
{{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -2574,6 +2580,26 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
return builder.create<mlir::arith::MulFOp>(loc, atan, factor);
}
+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
+ mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+ mlir::Value arg1) {
+ auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ arg0 = builder.createConvert(loc, llvmPointerType, arg0);
+ return builder.create<mlir::LLVM::AtomicRMWOp>(
+ loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst);
+}
+
+mlir::Value IntrinsicLibrary::genAtomAdd(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+
+ mlir::LLVM::AtomicBinOp binOp =
+ mlir::isa<mlir::IntegerType>(args[1].getType())
+ ? mlir::LLVM::AtomicBinOp::add
+ : mlir::LLVM::AtomicBinOp::fadd;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
// ASSOCIATED
fir::ExtendedValue
IntrinsicLibrary::genAssociated(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 3d487fd000a094..2dc0ed1f49bf67 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system()
end function
end interface
public :: __fadd_ru
+
+ ! Atomic Operations
+ interface atomicadd
+ attributes(device) pure integer function atomicaddi(address, val)
+ !dir$ ignore_tkr (rd) address, (d) val
+ integer, intent(inout) :: address
+ integer, value :: val
+ end function
+ attributes(device) pure real function atomicaddf(address, val)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real, intent(inout) :: address
+ real, value :: val
+ end function
+ attributes(device) pure real*8 function atomicaddd(address, val)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real*8, intent(inout) :: address
+ real*8, value :: val
+ end function
+ attributes(device) pure integer(8) function atomicaddl(address, val)
+ !dir$ ignore_tkr (rd) address, (dk) val
+ integer(8), intent(inout) :: address
+ integer(8), value :: val
+ end function
+ end interface
+public :: atomicadd
+
end module
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 2042bbbe19650a..661e5728bf85b8 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -5,6 +5,10 @@
attributes(global) subroutine devsub()
implicit none
integer :: ret
+ real(4) :: af
+ real(8) :: ad
+ integer(4) :: ai
+ integer(8) :: al
call syncthreads()
call syncwarp(1)
@@ -14,6 +18,11 @@ attributes(global) subroutine devsub()
ret = syncthreads_and(1)
ret = syncthreads_count(1)
ret = syncthreads_or(1)
+
+ ai = atomicadd(ai, 1_4)
+ al = atomicadd(al, 1_8)
+ af = atomicadd(af, 1.0_4)
+ ad = atomicadd(ad, 1.0_8)
end
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -25,6 +34,10 @@ end
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
! CHECK: func.func private @llvm.nvvm.barrier0()
! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs<bind_c>}
diff --git a/flang/test/Semantics/cuf-device-procedures01.cuf b/flang/test/Semantics/cuf-device-procedures01.cuf
index b9918d8a4ae4ce..92ee02bb3c64df 100644
--- a/flang/test/Semantics/cuf-device-procedures01.cuf
+++ b/flang/test/Semantics/cuf-device-procedures01.cuf
@@ -28,8 +28,17 @@ end
! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice
subroutine host()
+ real(4) :: af
+ real(8) :: ad
+ integer(4) :: ai
+ integer(8) :: al
call syncthreads()
+ ai = atomicadd(ai, 1_4)
+ al = atomicadd(al, 1_8)
+ af = atomicadd(af, 1.0_4)
+ ad = atomicadd(ad, 1.0_8)
end subroutine
! CHECK-LABEL: Subprogram scope: host
+! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}}
! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}}
>From 18120ce5b2eefa1b5380ad46ff27b13dd17a56b8 Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Tue, 21 Jan 2025 14:46:58 -0800
Subject: [PATCH 2/2] Corrected includes added by clangd
---
flang/include/flang/Optimizer/Builder/IntrinsicCall.h | 1 -
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 +---
2 files changed, 1 insertion(+), 4 deletions(-)
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index cba7378cfbbfec..d5f794a382334f 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -20,7 +20,6 @@
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
-#include <mlir/IR/Value.h>
#include <optional>
namespace fir {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index fd88a44ac3bcae..0dccd168714e1a 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -44,6 +44,7 @@
#include "flang/Runtime/iostat-consts.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "llvm/Support/CommandLine.h"
@@ -51,9 +52,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
-#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
-#include <mlir/IR/BuiltinAttributes.h>
-#include <mlir/IR/Value.h>
#include <optional>
#define DEBUG_TYPE "flang-lower-intrinsic"
More information about the flang-commits
mailing list