[flang-commits] [flang] [flang][cuda] Add conversion pattern for cuf.kernel_launch op (PR #114129)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Tue Oct 29 13:54:34 PDT 2024
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/114129
None
>From 822acc7a1c3942e5da45b3a4c3df683d530ff15c Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 29 Oct 2024 11:55:04 -0700
Subject: [PATCH] [flang][cuda] Add conversion pattern for cuf.kernel_launch op
---
.../Optimizer/Transforms/CUFOpConversion.cpp | 69 ++++++++++++++++++-
flang/test/Fir/CUDA/cuda-launch.fir | 64 +++++++++++++++++
2 files changed, 131 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Fir/CUDA/cuda-launch.fir
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 14cc1cb508cfc0..2e1ff203707b22 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -15,6 +15,7 @@
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
#include "flang/Runtime/CUDA/allocatable.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
@@ -620,6 +621,69 @@ struct CufDataTransferOpConversion
const mlir::SymbolTable &symtab;
};
+struct CUFLaunchOpConversion
+ : public mlir::OpRewritePattern<cuf::KernelLaunchOp> {
+public:
+ using OpRewritePattern::OpRewritePattern;
+
+ CUFLaunchOpConversion(mlir::MLIRContext *context,
+ const mlir::SymbolTable &symTab)
+ : OpRewritePattern(context), symTab{symTab} {}
+
+ mlir::LogicalResult
+ matchAndRewrite(cuf::KernelLaunchOp op,
+ mlir::PatternRewriter &rewriter) const override {
+ mlir::Location loc = op.getLoc();
+ auto idxTy = mlir::IndexType::get(op.getContext());
+ auto zero = rewriter.create<mlir::arith::ConstantOp>(
+ loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
+ auto gridSizeX =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
+ auto gridSizeY =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridY());
+ auto gridSizeZ =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridZ());
+ auto blockSizeX =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockX());
+ auto blockSizeY =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockY());
+ auto blockSizeZ =
+ rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockZ());
+ auto kernelName = mlir::SymbolRefAttr::get(
+ rewriter.getStringAttr(cudaDeviceModuleName),
+ {mlir::SymbolRefAttr::get(
+ rewriter.getContext(),
+ op.getCallee().getLeafReference().getValue())});
+ mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+ if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
+ op.getCallee().getLeafReference())) {
+ if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
+ cuf::getClusterDimsAttrName())) {
+ clusterDimX = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getX().getInt());
+ clusterDimY = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getY().getInt());
+ clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
+ loc, clusterDimsAttr.getZ().getInt());
+ }
+ }
+ auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
+ loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
+ mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
+ op.getArgs());
+ if (clusterDimX && clusterDimY && clusterDimZ) {
+ gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
+ gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
+ gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
+ }
+ rewriter.replaceOp(op, gpuLaunchOp);
+ return mlir::success();
+ }
+
+private:
+ const mlir::SymbolTable &symTab;
+};
+
class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
public:
void runOnOperation() override {
@@ -637,7 +701,8 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
/*forceUnifiedTBAATree=*/false, *dl);
- target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect>();
+ target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+ mlir::gpu::GPUDialect>();
cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
patterns);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
@@ -656,5 +721,5 @@ void cuf::populateCUFToFIRConversionPatterns(
patterns.insert<CufAllocOpConversion>(patterns.getContext(), &dl, &converter);
patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
CufFreeOpConversion>(patterns.getContext());
- patterns.insert<CufDataTransferOpConversion>(patterns.getContext(), symtab);
+ patterns.insert<CufDataTransferOpConversion, CUFLaunchOpConversion>(patterns.getContext(), symtab);
}
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
new file mode 100644
index 00000000000000..f11bcbdb7fce55
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -0,0 +1,64 @@
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
+
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QPsub_device1() kernel {
+ cf.br ^bb1
+ ^bb1: // pred: ^bb0
+ gpu.return
+ }
+ gpu.func @_QPsub_device2(%arg0: !fir.ref<f32>) kernel {
+ cf.br ^bb1(%arg0 : !fir.ref<f32>)
+ ^bb1(%0: !fir.ref<f32>): // pred: ^bb0
+ %1 = fir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ %cst = arith.constant 2.000000e+00 : f32
+ fir.store %cst to %1 : !fir.ref<f32>
+ gpu.return
+ }
+ }
+
+ func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+ %0 = fir.alloca f32
+ // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+ %c1 = arith.constant 1 : index
+ %c11_i32 = arith.constant 11 : i32
+ %c6_i32 = arith.constant 6 : i32
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+ cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+
+ // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+ cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
+ return
+ }
+
+}
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>>) kernel {
+ gpu.return
+ }
+ }
+
+ func.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "adev"}) attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+ return
+ }
+ func.func @_QMmod1Phost_sub() {
+ %c10 = arith.constant 10 : index
+ %0 = cuf.alloc !fir.array<10xi32> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} -> !fir.ref<!fir.array<10xi32>>
+ %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %2:2 = hlfir.declare %0(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ cuf.kernel_launch @_QMmod1Psub1<<<%c1_i32, %c1_i32, %c1_i32, %c10_i32, %c1_i32, %c1_i32>>>(%2#1) : (!fir.ref<!fir.array<10xi32>>)
+ return
+ }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
+
More information about the flang-commits
mailing list