[flang-commits] [flang] [flang][cuda] Adding support for more atomic calls (PR #124671)

Mon Jan 27 18:06:25 PST 2025

https://github.com/Renaud-K created https://github.com/llvm/llvm-project/pull/124671

The PR follows the earlier https://github.com/llvm/llvm-project/pull/123840 PR for atomic operation support in CUF

>From ac74d6a7170ddd4c8c09c6b5b0400e229e72bbe4 Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Mon, 27 Jan 2025 17:57:17 -0800
Subject: [PATCH 1/2] Adding support for more atomic calls

---
 .../flang/Optimizer/Builder/IntrinsicCall.h   |   7 ++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  88 +++++++++++++
 flang/module/cudadevice.f90                   | 117 +++++++++++++++++-
 flang/test/Lower/CUDA/cuda-device-proc.cuf    |  40 ++++++
 4 files changed, 248 insertions(+), 4 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index e2ea89483ef11f..52ada485033323 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -186,6 +186,13 @@ struct IntrinsicLibrary {
   fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue
       genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index db9918c265164d..5edc050f7f82c4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -46,12 +46,15 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
+#include <mlir/IR/BuiltinTypes.h>
+#include <mlir/IR/Types.h>
 #include <optional>
 
 #define DEBUG_TYPE "flang-lower-intrinsic"
@@ -151,6 +154,22 @@ static constexpr IntrinsicHandler handlers[]{
     {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
     {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
     {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
+    {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
     {"bessel_jn",
      &I::genBesselJn,
      {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -2600,6 +2619,75 @@ mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
   return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
 }
 
+mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::sub
+          : mlir::LLVM::AtomicBinOp::fsub;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType,
+                                          llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::max
+          : mlir::LLVM::AtomicBinOp::fmax;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::min
+          : mlir::LLVM::AtomicBinOp::fmin;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
 // ASSOCIATED
 fir::ExtendedValue
 IntrinsicLibrary::genAssociated(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 53b6beaaf1ad8f..af516a1866fa97 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -106,10 +106,10 @@ attributes(device) pure real function atomicaddf(address, val)
     real, intent(inout) :: address
     real, value :: val
     end function
-    attributes(device) pure real*8 function atomicaddd(address, val)
+    attributes(device) pure real(8) function atomicaddd(address, val)
   !dir$ ignore_tkr (d) address, (d) val
-    real*8, intent(inout) :: address
-    real*8, value :: val
+    real(8), intent(inout) :: address
+    real(8), value :: val
     end function
     attributes(device) pure integer(8) function atomicaddl(address, val)
   !dir$ ignore_tkr (d) address, (d) val
@@ -117,6 +117,115 @@ attributes(device) pure integer(8) function atomicaddl(address, val)
     integer(8), value :: val
     end function
   end interface 
-public :: atomicadd
+  public :: atomicadd
+
+  interface atomicsub
+    attributes(device) pure integer function atomicsubi(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+    attributes(device) pure real function atomicsubf(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real, intent(inout) :: address
+    real, value :: val
+    end function
+    attributes(device) pure real(8) function atomicsubd(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real(8), intent(inout) :: address
+    real(8), value :: val
+    end function
+    attributes(device) pure integer(8) function atomicsubl(address, val)
+  !dir$ ignore_tkr (d) address, (dk) val
+    integer(8), intent(inout) :: address
+    integer(8), value :: val
+    end function
+  end interface
+  public :: atomicsub
+  
+  interface atomicmax
+    attributes(device) pure integer function atomicmaxi(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+    attributes(device) pure real function atomicmaxf(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real, intent(inout) :: address
+    real, value :: val
+    end function
+    attributes(device) pure real(8) function atomicmaxd(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real(8), intent(inout) :: address
+    real(8), value :: val
+    end function
+    attributes(device) pure integer(8) function atomicmaxl(address, val)
+  !dir$ ignore_tkr (d) address, (dk) val
+    integer(8), intent(inout) :: address
+    integer(8), value :: val
+    end function
+  end interface
+  public :: atomicmax
+  
+  interface atomicmin
+    attributes(device) pure integer function atomicmini(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+    attributes(device) pure real function atomicminf(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real, intent(inout) :: address
+    real, value :: val
+    end function
+    attributes(device) pure real(8) function atomicmind(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    real(8), intent(inout) :: address
+    real(8), value :: val
+    end function
+    attributes(device) pure integer(8) function atomicminl(address, val)
+  !dir$ ignore_tkr (d) address, (dk) val
+    integer(8), intent(inout) :: address
+    integer(8), value :: val
+    end function
+  end interface
+  public :: atomicmin
+  
+  interface atomicand
+    attributes(device) pure integer function atomicandi(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+  end interface
+  public :: atomicand
+  
+  interface atomicor
+    attributes(device) pure integer function atomicori(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+  end interface
+  public :: atomicor
+
+  interface atomicinc
+    attributes(device) pure integer function atomicinci(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+  end interface
+  public :: atomicinc
+  
+  interface atomicdec
+    attributes(device) pure integer function atomicdeci(address, val)
+  !dir$ ignore_tkr (d) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+  end interface
+  public :: atomicdec
+
 
 end module
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 661e5728bf85b8..7ef391c7d308ba 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -23,6 +23,26 @@ attributes(global) subroutine devsub()
   al = atomicadd(al, 1_8)
   af = atomicadd(af, 1.0_4)
   ad = atomicadd(ad, 1.0_8)
+  
+  ai = atomicsub(ai, 1_4)
+  al = atomicsub(al, 1_8)
+  af = atomicsub(af, 1.0_4)
+  ad = atomicsub(ad, 1.0_8)
+  
+  ai = atomicmax(ai, 1_4)
+  al = atomicmax(al, 1_8)
+  af = atomicmax(af, 1.0_4)
+  ad = atomicmax(ad, 1.0_8)
+  
+  ai = atomicmin(ai, 1_4)
+  al = atomicmin(al, 1_8)
+  af = atomicmin(af, 1.0_4)
+  ad = atomicmin(ad, 1.0_8)
+  
+  ai = atomicand(ai, 1_4)
+  ai = atomicor(ai, 1_4)
+  ai = atomicinc(ai, 1_4)
+  ai = atomicdec(ai, 1_4)
 end
 
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -39,6 +59,26 @@ end
 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
 
+! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
+
+! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
+
+! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
+
+! CHECK: %{{.*}} = llvm.atomicrmw _and  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw _or  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+
 ! CHECK: func.func private @llvm.nvvm.barrier0()
 ! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs<bind_c>}
 ! CHECK: func.func private @llvm.nvvm.membar.gl()

>From 6f7108f891f4730d662f35798d3c0a80787aa35f Mon Sep 17 00:00:00 2001
From: Renaud-K <rkauffmann at nvidia.com>
Date: Mon, 27 Jan 2025 18:01:44 -0800
Subject: [PATCH 2/2] Removing includes from MemRef experiments

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 5edc050f7f82c4..e75a29c968d177 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -46,15 +46,12 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
-#include <mlir/IR/BuiltinTypes.h>
-#include <mlir/IR/Types.h>
 #include <optional>
 
 #define DEBUG_TYPE "flang-lower-intrinsic"