[flang-commits] [flang] [flang][cuda] Add specialized gpu.launch_func conversion (PR #113493)
via flang-commits
flang-commits at lists.llvm.org
Wed Oct 23 13:52:22 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-flang-fir-hlfir
@llvm/pr-subscribers-flang-runtime
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
CUDA Fortran has a constructor that registers the fatbinary and the kernel functions. To launch a kernel, it does not need to rely on the MLIR runtime, so the lowering of `gpu.launch_func` is different. This patch adds a conversion pattern that converts `gpu.launch_func` into a call to the `CUFLaunchKernel` runtime function.
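For illustration only (not part of the patch; the SSA names are placeholders and the kernel symbol is borrowed from the test below), the new pattern rewrites a launch such as

```mlir
gpu.launch_func @cuda_device_mod::@_QMmod1Psub1
    blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz) : i64
    dynamic_shared_memory_size %smem args(%arg0 : !llvm.ptr)
```

into a direct call to the runtime entry point (with the usual `RTNAME` mangling, `CUFLaunchKernel` becomes `_FortranACUFLaunchKernel`):

```mlir
%kernel = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
%null   = llvm.mlir.zero : !llvm.ptr
llvm.call @_FortranACUFLaunchKernel(%kernel, %gx, %gy, %gz, %bx, %by, %bz, %smem, %args, %null)
    : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) -> ()
```

Here `%args` is a `void**`-style array of pointers to the kernel operands, which the pattern materializes on the stack (see `createKernelArgArray` in the diff).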
---
Patch is 20.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113493.diff
9 Files Affected:
- (added) flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h (+28)
- (modified) flang/include/flang/Optimizer/Transforms/Passes.h (+1)
- (modified) flang/include/flang/Optimizer/Transforms/Passes.td (+7)
- (modified) flang/lib/Optimizer/Transforms/CMakeLists.txt (+1)
- (added) flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp (+180)
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+1)
- (modified) flang/runtime/CUDA/registration.cpp (+5-1)
- (added) flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir (+104)
- (modified) flang/tools/fir-opt/fir-opt.cpp (+1)
``````````diff
diff --git a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
new file mode 100644
index 00000000000000..7d76c1f4e52187
--- /dev/null
+++ b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
@@ -0,0 +1,28 @@
+//===------- Optimizer/Transforms/CUFGPUToLLVMConversion.h ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
+#define FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace fir {
+class LLVMTypeConverter;
+}
+
+namespace cuf {
+
+void populateCUFGPUToLLVMConversionPatterns(
+ const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+ mlir::PatternBenefit benefit = 1);
+
+} // namespace cuf
+
+#endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 5d3067aa359813..e8f0a8444a31a1 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -41,6 +41,7 @@ namespace fir {
#define GEN_PASS_DECL_CFGCONVERSION
#define GEN_PASS_DECL_CUFADDCONSTRUCTOR
#define GEN_PASS_DECL_CUFDEVICEGLOBAL
+#define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
#define GEN_PASS_DECL_CUFOPCONVERSION
#define GEN_PASS_DECL_EXTERNALNAMECONVERSION
#define GEN_PASS_DECL_MEMREFDATAFLOWOPT
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 2efa543ca07148..a41f0f348f27a6 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -443,4 +443,11 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
];
}
+def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
+ let summary = "Convert some GPU operations lowered from CUF to runtime calls";
+ let dependentDialects = [
+ "mlir::LLVM::LLVMDialect"
+ ];
+}
+
#endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 8f4f731e009221..d20d3bc4108ce9 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -12,6 +12,7 @@ add_flang_library(FIRTransforms
CUFAddConstructor.cpp
CUFDeviceGlobal.cpp
CUFOpConversion.cpp
+ CUFGPUToLLVMConversion.cpp
ArrayValueCopy.cpp
ExternalNameConversion.cpp
MemoryUtils.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
new file mode 100644
index 00000000000000..5645ce6e6858c8
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -0,0 +1,180 @@
+//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace fir;
+using namespace mlir;
+using namespace Fortran::runtime;
+
+namespace {
+
+static mlir::Value createKernelArgArray(mlir::Location loc,
+ mlir::ValueRange operands,
+ mlir::PatternRewriter &rewriter) {
+
+ auto *ctx = rewriter.getContext();
+ llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);
+
+ for (auto [i, arg] : llvm::enumerate(operands))
+ structTypes[i] = arg.getType();
+
+ auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
+ auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
+ mlir::Type i32Ty = rewriter.getI32Type();
+ auto one = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
+ mlir::Value argStruct =
+ rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
+ auto size = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
+ mlir::Value argArray =
+ rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);
+
+ for (auto [i, arg] : llvm::enumerate(operands)) {
+ auto indice = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
+ mlir::Value structMember = rewriter.create<LLVM::GEPOp>(
+ loc, ptrTy, structTy, argStruct, mlir::ArrayRef<mlir::Value>({indice}));
+ rewriter.create<LLVM::StoreOp>(loc, arg, structMember);
+ mlir::Value arrayMember = rewriter.create<LLVM::GEPOp>(
+ loc, ptrTy, structTy, argArray, mlir::ArrayRef<mlir::Value>({indice}));
+ rewriter.create<LLVM::StoreOp>(loc, structMember, arrayMember);
+ }
+ return argArray;
+}
+
+struct GPULaunchKernelConversion
+ : public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
+ explicit GPULaunchKernelConversion(
+ const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
+ : mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
+ benefit) {}
+
+ using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;
+
+ mlir::LogicalResult
+ matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const override {
+
+ if (op.hasClusterSize()) {
+ return mlir::failure();
+ }
+
+ mlir::Location loc = op.getLoc();
+ auto *ctx = rewriter.getContext();
+ mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
+ mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
+ mlir::Type i32Ty = rewriter.getI32Type();
+ if (!dynamicMemorySize)
+ dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
+
+ mlir::Value kernelArgs =
+ createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);
+
+ auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
+ auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
+ mlir::Value kernelPtr;
+ if (!kernel) {
+ auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
+ if (!funcOp)
+ return mlir::failure();
+ kernelPtr =
+ rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, funcOp.getName());
+ } else {
+ kernelPtr =
+ rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
+ }
+
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+ RTNAME_STRING(CUFLaunchKernel));
+
+ auto llvmIntPtrType = mlir::IntegerType::get(
+ ctx, this->getTypeConverter()->getPointerBitwidth(0));
+ auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
+ auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+ voidTy,
+ {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ /*isVarArg=*/false);
+
+ auto cufLaunchKernel = mlir::SymbolRefAttr::get(
+ mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+ if (!funcOp) {
+ mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+ rewriter.setInsertionPointToStart(mod.getBody());
+ auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+ loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+ launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+ }
+
+ mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);
+
+ rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+ op, funcTy, cufLaunchKernel,
+ mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
+ adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
+ adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+ adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
+ nullPtr});
+
+ return mlir::success();
+ }
+};
+
+class CUFGPUToLLVMConversion
+ : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
+public:
+ void runOnOperation() override {
+ auto *ctx = &getContext();
+ mlir::RewritePatternSet patterns(ctx);
+ mlir::ConversionTarget target(*ctx);
+
+ mlir::Operation *op = getOperation();
+ mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+ if (!module)
+ return signalPassFailure();
+
+ std::optional<mlir::DataLayout> dl =
+ fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+ fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+ /*forceUnifiedTBAATree=*/false, *dl);
+ cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
+ target.addIllegalOp<mlir::gpu::LaunchFuncOp>();
+ target.addLegalDialect<mlir::LLVM::LLVMDialect>();
+ if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
+ std::move(patterns)))) {
+ mlir::emitError(mlir::UnknownLoc::get(ctx),
+ "error in CUF GPU op conversion\n");
+ signalPassFailure();
+ }
+ }
+};
+} // namespace
+
+void cuf::populateCUFGPUToLLVMConversionPatterns(
+ const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+ mlir::PatternBenefit benefit) {
+ patterns.add<GPULaunchKernelConversion>(converter, benefit);
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 069d88e0afca47..9c2b882c7f46fe 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -20,6 +20,7 @@
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/CUDA/memory.h"
#include "flang/Runtime/allocatable.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
diff --git a/flang/runtime/CUDA/registration.cpp b/flang/runtime/CUDA/registration.cpp
index 22d43a7dc57a3a..20d274c4d8d1c2 100644
--- a/flang/runtime/CUDA/registration.cpp
+++ b/flang/runtime/CUDA/registration.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "flang/Runtime/CUDA/registration.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
#include "cuda_runtime.h"
@@ -31,5 +33,7 @@ void RTDEF(CUFRegisterFunction)(
__cudaRegisterFunction(module, fctSym, fctName, fctName, -1, (uint3 *)0,
(uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0);
}
-}
+
+} // extern "C"
+
} // namespace Fortran::runtime::cuda
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
new file mode 100644
index 00000000000000..f10bd82f978dc4
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -0,0 +1,104 @@
+// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git at github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_QMmod1Phost_sub() {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+ %2 = llvm.mlir.constant(40 : i64) : i64
+ %3 = llvm.mlir.constant(16 : i32) : i32
+ %4 = llvm.mlir.constant(25 : i32) : i32
+ %5 = llvm.mlir.constant(21 : i32) : i32
+ %6 = llvm.mlir.constant(17 : i32) : i32
+ %7 = llvm.mlir.constant(1 : index) : i64
+ %8 = llvm.mlir.constant(27 : i32) : i32
+ %9 = llvm.mlir.constant(6 : i32) : i32
+ %10 = llvm.mlir.constant(1 : i32) : i32
+ %11 = llvm.mlir.constant(0 : i32) : i32
+ %12 = llvm.mlir.constant(10 : index) : i64
+ %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+ %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+ %15 = llvm.mlir.constant(10 : index) : i64
+ %16 = llvm.mlir.constant(1 : index) : i64
+ %17 = llvm.alloca %15 x i32 : (i64) -> !llvm.ptr
+ %18 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %19 = llvm.insertvalue %17, %18[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %20 = llvm.insertvalue %17, %19[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %21 = llvm.mlir.constant(0 : index) : i64
+ %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %23 = llvm.insertvalue %15, %22[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %24 = llvm.insertvalue %16, %23[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %25 = llvm.extractvalue %24[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %26 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %29 = llvm.mlir.constant(0 : index) : i64
+ %30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %31 = llvm.mlir.constant(10 : index) : i64
+ %32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %33 = llvm.mlir.constant(1 : index) : i64
+ %34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %35 = llvm.mlir.constant(1 : index) : i64
+ %36 = llvm.mlir.constant(11 : index) : i64
+ %37 = llvm.mlir.constant(1 : index) : i64
+ llvm.br ^bb1(%35 : i64)
+ ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2
+ %39 = llvm.icmp "slt" %38, %36 : i64
+ llvm.cond_br %39, ^bb2, ^bb3
+ ^bb2: // pred: ^bb1
+ %40 = llvm.mlir.constant(-1 : index) : i64
+ %41 = llvm.add %38, %40 : i64
+ %42 = llvm.extractvalue %34[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+ %43 = llvm.getelementptr %42[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %11, %43 : i32, !llvm.ptr
+ %44 = llvm.add %38, %37 : i64
+ llvm.br ^bb1(%44 : i64)
+ ^bb3: // pred: ^bb1
+ %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
+ gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr)
+ %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
+ %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
+ %48 = llvm.mlir.constant(9 : i32) : i32
+ %49 = llvm.mlir.zero : !llvm.ptr
+ %50 = llvm.getelementptr %49[1] : (!llvm.ptr) -> !llvm.ptr, i32
+ %51 = llvm.ptrtoint %50 : !llvm.ptr to i64
+ %52 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %53 = llvm.insertvalue %51, %52[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %54 = llvm.mlir.constant(20240719 : i32) : i32
+ %55 = llvm.insertvalue %54, %53[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %56 = llvm.mlir.constant(1 : i32) : i32
+ %57 = llvm.trunc %56 : i32 to i8
+ %58 = llvm.insertvalue %57, %55[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %59 = llvm.trunc %48 : i32 to i8
+ %60 = llvm.insertvalue %59, %58[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %61 = llvm.mlir.constant(0 : i32) : i32
+ %62 = llvm.trunc %61 : i32 to i8
+ %63 = llvm.insertvalue %62, %60[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %64 = llvm.mlir.constant(0 : i32) : i32
+ %65 = llvm.trunc %64 : i32 to i8
+ %66 = llvm.insertvalue %65, %63[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %67 = llvm.mlir.constant(0 : i64) : i64
+ %68 = llvm.mlir.constant(1 : i64) : i64
+ %69 = llvm.insertvalue %68, %66[7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %70 = llvm.insertvalue %12, %69[7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %71 = llvm.insertvalue %51, %70[7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %72 = llvm.mul %51, %12 : i64
+ %73 = llvm.insertvalue %25, %71[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ llvm.store %73, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+ llvm.return
+ }
+ llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+ llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+ %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+ llvm.return %0 : !llvm.array<2 x i8>
+ }
+ llvm.func @_FortranAioBeginExternalListOutput(i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.io, fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFDataTransferPtrPtr(!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+ gpu.binary @cuda_device...
[truncated]
``````````
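Not part of the patch, but to summarize the argument marshalling in `createKernelArgArray`: the helper packs all kernel operands into one stack-allocated struct, then fills a parallel array with a pointer to each struct field; that array is what gets passed to `CUFLaunchKernel`. A rough sketch of the IR it emits for a single `!llvm.ptr` operand (SSA names invented):

```mlir
%c1     = llvm.mlir.constant(1 : i32) : i32
// One struct instance holding every kernel operand.
%struct = llvm.alloca %c1 x !llvm.struct<(ptr)> : (i32) -> !llvm.ptr
// One pointer slot per operand.
%array  = llvm.alloca %c1 x !llvm.ptr : (i32) -> !llvm.ptr
%i0     = llvm.mlir.constant(0 : i32) : i32
// Store the operand into its struct field...
%field  = llvm.getelementptr %struct[%i0] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.struct<(ptr)>
llvm.store %arg0, %field : !llvm.ptr, !llvm.ptr
// ...and record the field's address in the pointer array.
%slot   = llvm.getelementptr %array[%i0] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.struct<(ptr)>
llvm.store %field, %slot : !llvm.ptr, !llvm.ptr
```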
</details>
https://github.com/llvm/llvm-project/pull/113493