[flang-commits] [flang] [flang][cuda] Avoid intrinsics simplification in device context (PR #117026)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Wed Nov 20 14:24:27 PST 2024
Valentin Clement =?utf-8?b?KOODkOODrOODsw=?Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/117026 at github.com>
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/117026
>From 9a4ab3d5f63666786cb22eb56231f93c0ec3883e Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 20 Nov 2024 10:49:31 -0800
Subject: [PATCH 1/2] [flang][cuda] Avoid intrinsics simplification in device
context
---
.../flang/Optimizer/Transforms/CUFCommon.h | 2 +
flang/lib/Optimizer/Transforms/CUFCommon.cpp | 17 +++++++
.../Transforms/SimplifyIntrinsics.cpp | 3 ++
flang/test/Fir/CUDA/cuda-device-context.mlir | 44 +++++++++++++++++++
4 files changed, 66 insertions(+)
create mode 100644 flang/test/Fir/CUDA/cuda-device-context.mlir
diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Transforms/CUFCommon.h
index b88133489df5e2..f019d1893bda4c 100644
--- a/flang/include/flang/Optimizer/Transforms/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Transforms/CUFCommon.h
@@ -20,6 +20,8 @@ namespace cuf {
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
+bool isInCUDADeviceContext(mlir::Operation *op);
+
} // namespace cuf
#endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
index 162df8f9cab9cd..5b7631bbacb5f2 100644
--- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Transforms/CUFCommon.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
/// Retrieve or create the CUDA Fortran GPU module in the give in \p mod.
@@ -26,3 +28,18 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
symTab.insert(gpuMod, insertPt);
return gpuMod;
}
+
+bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
+ if (!op)
+ return false;
+ if (op->getParentOfType<cuf::KernelOp>() ||
+ op->getParentOfType<mlir::gpu::GPUFuncOp>())
+ return true;
+ if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
+ if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>(
+ cuf::getProcAttrName())) {
+ return cudaProcAttr.getValue() != cuf::ProcAttribute::Host;
+ }
+ }
+ return false;
+}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index c61179a7460e32..d3567f453fceb3 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -31,6 +31,7 @@
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Optimizer/Transforms/Utils.h"
#include "flang/Runtime/entry-names.h"
@@ -1276,6 +1277,8 @@ void SimplifyIntrinsicsPass::runOnOperation() {
fir::KindMapping kindMap = fir::getKindMapping(module);
module.walk([&](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
+ if (cuf::isInCUDADeviceContext(op))
+ return;
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
mlir::StringRef funcName = callee.getLeafReference().getValue();
// Replace call to runtime function for SUM when it has single
diff --git a/flang/test/Fir/CUDA/cuda-device-context.mlir b/flang/test/Fir/CUDA/cuda-device-context.mlir
new file mode 100644
index 00000000000000..689c92dc50e6ab
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-context.mlir
@@ -0,0 +1,44 @@
+// RUN: fir-opt --simplify-intrinsics %s | FileCheck %s
+
+func.func @_QPsum_in_device(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "n"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+ %c5_i32 = arith.constant 5 : i32
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c-1 = arith.constant -1 : index
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.shape %c-1 : (index) -> !fir.shape<1>
+ %2 = fir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QFsum_in_deviceEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xi32>>
+ %3 = fir.embox %2(%1) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+ %4 = fir.alloca i32
+ fir.store %arg1 to %4 : !fir.ref<i32>
+ %5 = fir.declare %4 dummy_scope %0 {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsum_in_deviceEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ %12 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsum_in_deviceEi"}
+ %13 = fir.declare %12 {uniq_name = "_QFsum_in_deviceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %14 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %18 = fir.load %5 : !fir.ref<i32>
+ %19 = fir.convert %18 : (i32) -> index
+ %20 = arith.cmpi sgt, %19, %c0 : index
+ %21 = arith.select %20, %19, %c0 : index
+ %22 = fir.alloca !fir.array<?xi32>, %21 {bindc_name = "auto", uniq_name = "_QFsum_in_deviceEauto"}
+ %23 = fir.shape %21 : (index) -> !fir.shape<1>
+ %24 = fir.declare %22(%23) {uniq_name = "_QFsum_in_deviceEauto"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+ %25 = fir.embox %24(%23) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+ %26 = fir.undefined index
+ %27 = fir.slice %c1, %19, %c1 : (index, index, index) -> !fir.slice<1>
+ %28 = fir.embox %24(%23) [%27] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<?xi32>>
+ %29 = fir.absent !fir.box<i1>
+ %30 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref<!fir.char<1,50>>
+ %31 = fir.convert %28 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+ %32 = fir.convert %30 : (!fir.ref<!fir.char<1,50>>) -> !fir.ref<i8>
+ %33 = fir.convert %c0 : (index) -> i32
+ %34 = fir.convert %29 : (!fir.box<i1>) -> !fir.box<none>
+ %35 = fir.call @_FortranASumInteger4(%31, %32, %c5_i32, %33, %34) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+ %36 = fir.load %13 : !fir.ref<i32>
+ %37 = fir.convert %36 : (i32) -> i64
+ %38 = fir.array_coor %2(%1) %37 : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, i64) -> !fir.ref<i32>
+ fir.store %35 to %38 : !fir.ref<i32>
+ return
+}
+
+// CHECK-LABEL: func.func @_QPsum_in_device
+// CHECK-NOT: fir.call @_FortranASumInteger4x1_contract_simplified
>From f23ba5c15749d6bcd2c0ca8be73a957a2f5ca82d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Wed, 20 Nov 2024 14:24:19 -0800
Subject: [PATCH 2/2] Update flang/test/Fir/CUDA/cuda-device-context.mlir
---
flang/test/Fir/CUDA/cuda-device-context.mlir | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/flang/test/Fir/CUDA/cuda-device-context.mlir b/flang/test/Fir/CUDA/cuda-device-context.mlir
index 689c92dc50e6ab..96c9e7ab4ecf94 100644
--- a/flang/test/Fir/CUDA/cuda-device-context.mlir
+++ b/flang/test/Fir/CUDA/cuda-device-context.mlir
@@ -40,5 +40,10 @@ func.func @_QPsum_in_device(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr =
return
}
+// Check that intrinsic simplification is disabled in CUDA Fortran context. The simplified intrinsic is
+// created in the module op but the device func will be migrated into a gpu module op resulting in a
+// missing symbol error.
+// The simplified intrinsic could also be migrated to the gpu module but the choice has not been made
+// at this point.
// CHECK-LABEL: func.func @_QPsum_in_device
// CHECK-NOT: fir.call @_FortranASumInteger4x1_contract_simplified
More information about the flang-commits
mailing list