[flang-commits] [flang] [flang][cuda] Add pass to transform predefined variables (PR #174451)
via flang-commits
flang-commits at lists.llvm.org
Mon Jan 5 09:34:09 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Change the predefined variables injected in the device function to their corresponding NVVM dialect operations.
---
Patch is 21.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174451.diff
4 Files Affected:
- (modified) flang/include/flang/Optimizer/Transforms/Passes.td (+10)
- (modified) flang/lib/Optimizer/Transforms/CMakeLists.txt (+1)
- (added) flang/lib/Optimizer/Transforms/CUDA/CUFPredefinedVarToGPU.cpp (+153)
- (added) flang/test/Fir/CUDA/predefined-variables.mlir (+188)
``````````diff
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index f50202784e2dc..d457cbbb32798 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -523,6 +523,16 @@ def CUFComputeSharedMemoryOffsetsAndSize
];
}
+// Pass definition for the rewrite of the CUDA Fortran predefined variables
+// into NVVM dialect operations (see the description below).
+def CUFPredefinedVarToGPU
+    : Pass<"cuf-predefined-var-to-gpu", "::mlir::func::FuncOp"> {
+  let summary = "Transform predefined variables to GPU operations";
+  let description = [{
+ Change the predefined variables injected in the device function to their
+ corresponding NVVM dialect operations.
+ }];
+  // The implementation creates arith.constant and arith.addi in addition to
+  // the NVVM operations, so both dialects are declared as dependencies.
+  let dependentDialects = [
+    "mlir::arith::ArithDialect", "mlir::NVVM::NVVMDialect"
+  ];
+}
+
def SetRuntimeCallAttributes
: Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> {
let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR";
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 619f3adc67c85..c07921b23f3ff 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_flang_library(FIRTransforms
ConstantArgumentGlobalisation.cpp
ControlFlowConverter.cpp
CUDA/CUFAllocationConversion.cpp
+ CUDA/CUFPredefinedVarToGPU.cpp
CUFAddConstructor.cpp
CUFDeviceGlobal.cpp
CUFOpConversion.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFPredefinedVarToGPU.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFPredefinedVarToGPU.cpp
new file mode 100644
index 0000000000000..3eb655980a391
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFPredefinedVarToGPU.cpp
@@ -0,0 +1,153 @@
+//===-- CUFPredefinedVarToGPU.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Pass/Pass.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFPREDEFINEDVARTOGPU
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace mlir;
+
+namespace {
+
+/// Append the x, y and z values of one predefined variable to \p values.
+///
+/// The values are produced by creating \p OpTyX, \p OpTyY and \p OpTyZ, in
+/// that order, each with an i32 result type. When \p incrementByOne is true,
+/// an arith.addi of the freshly created value and \p c1 is appended instead of
+/// the raw value; otherwise \p c1 is not used.
+template <typename OpTyX, typename OpTyY, typename OpTyZ>
+static void createForAllDimensions(mlir::OpBuilder &builder, mlir::Location loc,
+                                   mlir::Value c1,
+                                   SmallVectorImpl<mlir::Value> &values,
+                                   bool incrementByOne = false) {
+  // Record one dimension: either the raw value or the raw value plus c1.
+  auto appendDimension = [&](mlir::Value rawValue) {
+    if (incrementByOne)
+      values.push_back(
+          mlir::arith::AddIOp::create(builder, loc, rawValue, c1));
+    else
+      values.push_back(rawValue);
+  };
+
+  appendDimension(OpTyX::create(builder, loc, builder.getI32Type()));
+  appendDimension(OpTyY::create(builder, loc, builder.getI32Type()));
+  appendDimension(OpTyZ::create(builder, loc, builder.getI32Type()));
+}
+
+// Components of the mangled names of the predefined variables. They are
+// concatenated by mangleBuiltin.
+static constexpr llvm::StringRef builtinsModuleName{"__fortran_builtins"};
+static constexpr llvm::StringRef builtinVarPrefix{"__builtin_"};
+
+// Unprefixed names of the predefined variables handled by the pass.
+static constexpr llvm::StringRef threadidx{"threadidx"};
+static constexpr llvm::StringRef blockidx{"blockidx"};
+static constexpr llvm::StringRef blockdim{"blockdim"};
+static constexpr llvm::StringRef griddim{"griddim"};
+
+// Field indices compared against the field index of a fir.coordinate_of that
+// addresses a predefined variable.
+static constexpr unsigned field_x{0};
+static constexpr unsigned field_y{1};
+static constexpr unsigned field_z{2};
+
+/// Build the mangled name of the predefined variable \p varName:
+/// "_QM" + builtinsModuleName + "E" + builtinVarPrefix + varName.
+std::string mangleBuiltin(llvm::StringRef varName) {
+  std::string mangled("_QM");
+  mangled.append(builtinsModuleName.begin(), builtinsModuleName.end());
+  mangled.append("E");
+  mangled.append(builtinVarPrefix.begin(), builtinVarPrefix.end());
+  mangled.append(varName.begin(), varName.end());
+  return mangled;
+}
+
+/// Replace the loads of one field of a predefined variable with \p gpuValue.
+///
+/// \p coordOp is expected to carry exactly one field index. When that index
+/// equals \p fieldIdx, every user of \p coordOp (each is expected to be a
+/// fir.load) has all uses of its result replaced with \p gpuValue and is then
+/// erased. \p coordOp itself is left in place; nothing happens when the field
+/// index differs. \p builder and \p loc are not used by this function.
+static void processCoordinateOp(mlir::OpBuilder &builder, mlir::Location loc,
+                                fir::CoordinateOp coordOp, unsigned fieldIdx,
+                                mlir::Value &gpuValue) {
+  std::optional<llvm::ArrayRef<int32_t>> fieldIndices =
+      coordOp.getFieldIndices();
+  assert(fieldIndices && fieldIndices->size() == 1 &&
+         "expect only one coordinate");
+  if (static_cast<unsigned>((*fieldIndices)[0]) != fieldIdx)
+    return;
+
+  // Collect the loads first and erase them afterwards so that no user is
+  // destroyed while the use list of the coordinate result is being walked.
+  llvm::SmallVector<fir::LoadOp> loadsToErase;
+  for (mlir::OpOperand &coordUse : coordOp.getResult().getUses()) {
+    assert(mlir::isa<fir::LoadOp>(coordUse.getOwner()) &&
+           "only expect load op");
+    // The user is required to be a load, so use cast<> instead of a
+    // dyn_cast<> whose null result was never tested.
+    auto loadOp = mlir::cast<fir::LoadOp>(coordUse.getOwner());
+    loadOp.getResult().replaceAllUsesWith(gpuValue);
+    loadsToErase.push_back(loadOp);
+  }
+  for (fir::LoadOp loadOp : loadsToErase)
+    loadOp.erase();
+}
+
+/// Rewrite the uses of \p declareOp when it declares \p builtinVar.
+///
+/// Nothing happens unless the unique name of \p declareOp equals
+/// \p builtinVar. Otherwise, for every fir.coordinate_of user of the declare,
+/// the loads of the x, y and z fields are replaced with \p gpuValues[0],
+/// \p gpuValues[1] and \p gpuValues[2], and the coordinate operation is
+/// queued in \p opsToDelete.
+///
+/// The declare is queued only when all of its users were coordinate
+/// operations, and the operation defining its memref only when the declare is
+/// its single use, so that no operation with remaining uses is erased by the
+/// caller. Users that are not coordinate operations are skipped and left
+/// untouched.
+static void
+processDeclareOp(mlir::OpBuilder &builder, mlir::Location loc,
+                 fir::DeclareOp declareOp, llvm::StringRef builtinVar,
+                 llvm::SmallVectorImpl<mlir::Value> &gpuValues,
+                 llvm::SmallVectorImpl<mlir::Operation *> &opsToDelete) {
+  if (declareOp.getUniqName() != builtinVar)
+    return;
+  assert(gpuValues.size() == 3 && "expect one value per dimension");
+
+  bool hasUnhandledUser = false;
+  for (mlir::OpOperand &use : declareOp.getResult().getUses()) {
+    auto coordOp = mlir::dyn_cast<fir::CoordinateOp>(use.getOwner());
+    if (!coordOp) {
+      // Not a field access: keep the user, and therefore the declare, alive
+      // instead of dereferencing a null operation.
+      hasUnhandledUser = true;
+      continue;
+    }
+    processCoordinateOp(builder, loc, coordOp, field_x, gpuValues[0]);
+    processCoordinateOp(builder, loc, coordOp, field_y, gpuValues[1]);
+    processCoordinateOp(builder, loc, coordOp, field_z, gpuValues[2]);
+    opsToDelete.push_back(coordOp);
+  }
+  if (hasUnhandledUser)
+    return;
+
+  opsToDelete.push_back(declareOp.getOperation());
+  // Only drop the definition of the memref when this declare is its sole use.
+  mlir::Operation *memrefDef = declareOp.getMemref().getDefiningOp();
+  if (memrefDef && memrefDef->hasOneUse())
+    opsToDelete.push_back(memrefDef);
+}
+
+/// Function pass that replaces the loads of the predefined variables
+/// (threadidx, blockidx, blockdim and griddim) with values built from NVVM
+/// operations. Only functions with a body and a device, global, grid_global
+/// or host_device CUDA procedure attribute are processed.
+struct CUFPredefinedVarToGPU
+    : public fir::impl::CUFPredefinedVarToGPUBase<CUFPredefinedVarToGPU> {
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+    // A function without a body has nothing to rewrite.
+    if (funcOp.getBody().empty())
+      return;
+
+    auto cudaProcAttr =
+        funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+            cuf::getProcAttrName());
+    if (!cudaProcAttr)
+      return;
+
+    const cuf::ProcAttribute procKind = cudaProcAttr.getValue();
+    const bool isHandledProc = procKind == cuf::ProcAttribute::Device ||
+                               procKind == cuf::ProcAttribute::Global ||
+                               procKind == cuf::ProcAttribute::GridGlobal ||
+                               procKind == cuf::ProcAttribute::HostDevice;
+    if (!isHandledProc)
+      return;
+
+    mlir::Location loc = funcOp.getLoc();
+    mlir::OpBuilder builder(funcOp.getContext());
+    builder.setInsertionPointToStart(&funcOp.getBody().front());
+
+    // Constant added to the thread and block ids (incrementByOne below).
+    auto c1 = mlir::arith::ConstantOp::create(
+        builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
+
+    // Create the x/y/z values of every predefined variable at the start of
+    // the entry block.
+    llvm::SmallVector<mlir::Value, 3> threadids;
+    llvm::SmallVector<mlir::Value, 3> blockids;
+    llvm::SmallVector<mlir::Value, 3> blockdims;
+    llvm::SmallVector<mlir::Value, 3> griddims;
+    createForAllDimensions<mlir::NVVM::ThreadIdXOp, mlir::NVVM::ThreadIdYOp,
+                           mlir::NVVM::ThreadIdZOp>(
+        builder, loc, c1, threadids, /*incrementByOne=*/true);
+    createForAllDimensions<mlir::NVVM::BlockIdXOp, mlir::NVVM::BlockIdYOp,
+                           mlir::NVVM::BlockIdZOp>(
+        builder, loc, c1, blockids, /*incrementByOne=*/true);
+    createForAllDimensions<mlir::NVVM::GridDimXOp, mlir::NVVM::GridDimYOp,
+                           mlir::NVVM::GridDimZOp>(builder, loc, c1, griddims);
+    createForAllDimensions<mlir::NVVM::BlockDimXOp, mlir::NVVM::BlockDimYOp,
+                           mlir::NVVM::BlockDimZOp>(builder, loc, c1,
+                                                    blockdims);
+
+    // Pair every predefined variable name with its replacement values.
+    struct PredefinedVar {
+      llvm::StringRef name;
+      llvm::SmallVectorImpl<mlir::Value> *values;
+    };
+    const PredefinedVar predefinedVars[] = {{threadidx, &threadids},
+                                            {blockidx, &blockids},
+                                            {blockdim, &blockdims},
+                                            {griddim, &griddims}};
+
+    // Erasure is deferred until every declare has been visited.
+    llvm::SmallVector<mlir::Operation *> opsToDelete;
+    for (fir::DeclareOp declareOp : funcOp.getOps<fir::DeclareOp>())
+      for (const PredefinedVar &var : predefinedVars)
+        processDeclareOp(builder, loc, declareOp, mangleBuiltin(var.name),
+                         *var.values, opsToDelete);
+
+    for (mlir::Operation *op : opsToDelete)
+      op->erase();
+  }
+};
+
+} // end anonymous namespace
diff --git a/flang/test/Fir/CUDA/predefined-variables.mlir b/flang/test/Fir/CUDA/predefined-variables.mlir
new file mode 100644
index 0000000000000..1a9a46d213be4
--- /dev/null
+++ b/flang/test/Fir/CUDA/predefined-variables.mlir
@@ -0,0 +1,188 @@
+// RUN: fir-opt --split-input-file --cuf-predefined-var-to-gpu --canonicalize %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-predefined-var-to-gpu --canonicalize %s | fir-opt --cuf-predefined-var-to-gpu --canonicalize | FileCheck %s
+
+// attributes(device) subroutine sub1(i)
+// integer :: i
+// i = threadidx%x
+// i = blockdim%x
+// i = blockidx%x
+// i = griddim%y
+// i = warpsize
+// if (threadidx%x == 0) i = 0
+// end subroutine
+
+// The following FIR output is coming from the small CUDA Fortran code above.
+// To reproduce the output or update it:
+// bbc -emit-hlfir -fcuda %s -o - | fir-opt --convert-hlfir-to-fir
+func.func @_QPsub1(%arg0: !fir.ref<i32> {fir.bindc_name = "i", cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
+ %0 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %1 = fir.declare %0 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %2 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %3 = fir.declare %2 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %4 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %5 = fir.declare %4 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %6 = fir.declare %arg0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %7 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %8 = fir.declare %7 {uniq_name = "_QM__fortran_builtinsE__builtin_threadidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %9 = fir.alloca i32 {bindc_name = "__builtin_warpsize", uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"}
+ %10 = fir.declare %9 {uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %12 = fir.coordinate_of %8, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %13 = fir.load %12 : !fir.ref<i32>
+ fir.store %13 to %6 : !fir.ref<i32>
+ %15 = fir.coordinate_of %1, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %16 = fir.load %15 : !fir.ref<i32>
+ fir.store %16 to %6 : !fir.ref<i32>
+ %18 = fir.coordinate_of %3, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %19 = fir.load %18 : !fir.ref<i32>
+ fir.store %19 to %6 : !fir.ref<i32>
+ %21 = fir.coordinate_of %5, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %22 = fir.load %21 : !fir.ref<i32>
+ fir.store %22 to %6 : !fir.ref<i32>
+ %c32_i32 = arith.constant 32 : i32
+ fir.store %c32_i32 to %6 : !fir.ref<i32>
+ %24 = fir.coordinate_of %8, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %25 = fir.load %24 : !fir.ref<i32>
+ %c0_i32 = arith.constant 0 : i32
+ %26 = arith.cmpi eq, %25, %c0_i32 : i32
+ fir.if %26 {
+ %c0_i32_0 = arith.constant 0 : i32
+ fir.store %c0_i32_0 to %6 : !fir.ref<i32>
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @_QPsub1
+
+// CHECK: %[[WARPSIZE:.*]] = arith.constant 32 : i32
+
+// CHECK: %[[BASE_THREAD_ID_X:.*]] = nvvm.read.ptx.sreg.tid.x : i32
+// CHECK: %[[THREAD_ID_X:.*]] = arith.addi %[[BASE_THREAD_ID_X]], %c1{{.*}} : i32
+// CHECK: %[[BASE_BLOCK_ID_X:.*]] = nvvm.read.ptx.sreg.ctaid.x : i32
+// CHECK: %[[BLOCK_ID_X:.*]] = arith.addi %[[BASE_BLOCK_ID_X]], %c1{{.*}} : i32
+// CHECK: %[[GRID_DIM_Y:.*]] = nvvm.read.ptx.sreg.nctaid.y : i32
+// CHECK: %[[BLOCK_DIM_X:.*]] = nvvm.read.ptx.sreg.ntid.x : i32
+
+// CHECK: %[[I:.*]] = fir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK: fir.store %[[THREAD_ID_X]] to %[[I]] : !fir.ref<i32>
+// CHECK: fir.store %[[BLOCK_DIM_X]] to %[[I]] : !fir.ref<i32>
+// CHECK: fir.store %[[BLOCK_ID_X]] to %[[I]] : !fir.ref<i32>
+// CHECK: fir.store %[[GRID_DIM_Y]] to %[[I]] : !fir.ref<i32>
+
+// CHECK: fir.store %[[WARPSIZE]] to %[[I]] : !fir.ref<i32>
+
+// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[THREAD_ID_X]], %c0{{.*}} : i32
+// CHECK: fir.if %[[CMP]] {
+// CHECK: fir.store %c0{{.*}} to %[[I]] : !fir.ref<i32>
+// CHECK: }
+
+
+// These functions should not be transformed. They are just here to make sure
+// the pass does not crash on them.
+
+func.func private @_QPsub2(%arg0: !fir.ref<i32> {fir.bindc_name = "i"})
+
+func.func @_QPsub3(%arg0: !fir.ref<i32> {fir.bindc_name = "i"}) {
+ return
+}
+
+// -----
+
+func.func @_QPsub1(%arg0: !fir.ref<i32> {fir.bindc_name = "i", cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
+ %0 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %1 = fir.declare %0 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %2 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %3 = fir.declare %2 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %4 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %5 = fir.declare %4 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %6 = fir.declare %arg0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %7 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %8 = fir.declare %7 {uniq_name = "_QM__fortran_builtinsE__builtin_threadidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %9 = fir.alloca i32 {bindc_name = "__builtin_warpsize", uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"}
+ %10 = fir.declare %9 {uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %12 = fir.coordinate_of %8, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %13 = fir.load %12 : !fir.ref<i32>
+ fir.store %13 to %6 : !fir.ref<i32>
+ %15 = fir.coordinate_of %1, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %16 = fir.load %15 : !fir.ref<i32>
+ fir.store %16 to %6 : !fir.ref<i32>
+ %18 = fir.coordinate_of %3, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %19 = fir.load %18 : !fir.ref<i32>
+ fir.store %19 to %6 : !fir.ref<i32>
+ %21 = fir.coordinate_of %5, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %22 = fir.load %21 : !fir.ref<i32>
+ fir.store %22 to %6 : !fir.ref<i32>
+ %c32_i32 = arith.constant 32 : i32
+ fir.store %c32_i32 to %6 : !fir.ref<i32>
+ %24 = fir.coordinate_of %8, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %25 = fir.load %24 : !fir.ref<i32>
+ %c0_i32 = arith.constant 0 : i32
+ %26 = arith.cmpi eq, %25, %c0_i32 : i32
+ fir.if %26 {
+ %c0_i32_0 = arith.constant 0 : i32
+ fir.store %c0_i32_0 to %6 : !fir.ref<i32>
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @_QPsub1
+
+// CHECK: %{{.*}} = arith.constant 32 : i32
+
+// CHECK: %[[BASE_THREAD_ID_X:.*]] = nvvm.read.ptx.sreg.tid.x : i32
+// CHECK: %{{.*}} = arith.addi %[[BASE_THREAD_ID_X]], %c1{{.*}} : i32
+// CHECK: %[[BASE_BLOCK_ID_X:.*]] = nvvm.read.ptx.sreg.ctaid.x : i32
+// CHECK: %{{.*}} = arith.addi %[[BASE_BLOCK_ID_X]], %c1{{.*}} : i32
+// CHECK: %{{.*}} = nvvm.read.ptx.sreg.nctaid.y : i32
+// CHECK: %{{.*}} = nvvm.read.ptx.sreg.ntid.x : i32
+
+
+// -----
+
+func.func @_QPsub1(%arg0: !fir.ref<i32> {fir.bindc_name = "i", cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
+ %0 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %1 = fir.declare %0 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %2 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %3 = fir.declare %2 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %4 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %5 = fir.declare %4 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %6 = fir.declare %arg0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %7 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %8 = fir.declare %7 {uniq_name = "_QM__fortran_builtinsE__builtin_threadidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+ %9 = fir.alloca i32 {bindc_name = "__builtin_warpsize", uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"}
+ %10 = fir.declare %9 {uniq_name = "_QM__fortran_builtinsEC__builtin_warpsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %tid = nvvm.read.ptx.sreg.tid.x : i32
+ %12 = fir.coordinate_of %8, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+ %13 = fir.load %12 : !fir.ref<i32>
+ fir.store %13 to %6 : !fir.ref<i32>
+ %15 = fir.coordinate_of %1, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/174451
More information about the flang-commits
mailing list