[flang-commits] [flang] [flang][cuda][NFC] Move CUDA intrinsics lowering to a separate file (PR #166461)
via flang-commits
flang-commits at lists.llvm.org
Tue Nov 4 15:07:21 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Just move all CUDA related intrinsics lowering to a separate file to avoid cluttering the main Fortran intrinsic file.
---
Patch is 142.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166461.diff
5 Files Affected:
- (added) flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h (+95)
- (modified) flang/include/flang/Optimizer/Builder/IntrinsicCall.h (-64)
- (modified) flang/lib/Optimizer/Builder/CMakeLists.txt (+1)
- (added) flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp (+1588)
- (modified) flang/lib/Optimizer/Builder/IntrinsicCall.cpp (+7-1339)
``````````diff
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
new file mode 100644
index 0000000000000..d735ce95a83dc
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -0,0 +1,95 @@
+//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H
+#define FORTRAN_LOWER_CUDAINTRINSICCALL_H
+
+#include "flang/Optimizer/Builder/IntrinsicCall.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+
+namespace fir {
+
+struct CUDAIntrinsicLibrary : IntrinsicLibrary {
+
+ // Constructors.
+ explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc)
+ : IntrinsicLibrary(builder, loc) {}
+ CUDAIntrinsicLibrary() = delete;
+ CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete;
+
+ // CUDA intrinsic handlers.
+ mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ fir::ExtendedValue genAtomicAddR2(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ template <int extent>
+ fir::ExtendedValue genAtomicAddVector(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ fir::ExtendedValue genAtomicCas(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ fir::ExtendedValue genAtomicExch(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ fir::ExtendedValue genAtomicXor(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
+ template <const char *fctName, int extent>
+ fir::ExtendedValue genLDXXFunc(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ template <typename OpTy>
+ mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>);
+ mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>);
+ void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>);
+ void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>);
+ void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
+ template <mlir::NVVM::VoteSyncKind kind>
+ mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
+};
+
+const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name);
+
+} // namespace fir
+
+#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index b64419f5ae6da..01d27fd5fc399 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -19,7 +19,6 @@
#include "flang/Runtime/iostat-consts.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include <optional>
@@ -187,25 +186,6 @@ struct IntrinsicLibrary {
mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
- fir::ExtendedValue genAtomicAddR2(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
- template <int extent>
- fir::ExtendedValue genAtomicAddVector(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>);
- fir::ExtendedValue genAtomicCas(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>);
- fir::ExtendedValue genAtomicExch(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>);
- fir::ExtendedValue genAtomicXor(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue
genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);
@@ -213,11 +193,6 @@ struct IntrinsicLibrary {
fir::ExtendedValue genAssociated(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>);
- void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genBesselJn(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genBesselYn(mlir::Type,
@@ -239,9 +214,6 @@ struct IntrinsicLibrary {
fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
- template <const char *fctName, int extent>
- fir::ExtendedValue genCUDALDXXFunc(mlir::Type,
- llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
@@ -281,7 +253,6 @@ struct IntrinsicLibrary {
llvm::ArrayRef<fir::ExtendedValue>);
template <Extremum, ExtremumBehavior>
mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>);
- void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genFraction(mlir::Type resultType,
mlir::ArrayRef<mlir::Value> args);
@@ -373,8 +344,6 @@ struct IntrinsicLibrary {
mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>);
template <typename Shift>
mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genMatmulTranspose(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
@@ -397,8 +366,6 @@ struct IntrinsicLibrary {
fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genNumImages(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
- template <typename OpTy>
- mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
void genPerror(llvm::ArrayRef<fir::ExtendedValue>);
@@ -453,56 +420,25 @@ struct IntrinsicLibrary {
fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>);
void genSleep(llvm::ArrayRef<fir::ExtendedValue>);
- void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
- void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genSystem(std::optional<mlir::Type>,
mlir::ArrayRef<fir::ExtendedValue> args);
void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
- void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>);
- void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genTransfer(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genTranspose(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genThisImage(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
- mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
- mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
- void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>);
- void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>);
- void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType,
llvm::ArrayRef<fir::ExtendedValue> args);
fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
- template <mlir::NVVM::VoteSyncKind kind>
- mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
/// Implement all conversion functions like DBLE, the first argument is
/// the value to convert. There may be an additional KIND arguments that
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 1f95259a857da..37c9c2d703c76 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FIRBuilder
BoxValue.cpp
Character.cpp
Complex.cpp
+ CUDAIntrinsicCall.cpp
CUFCommon.cpp
DoLoopHelper.cpp
FIRBuilder.cpp
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
new file mode 100644
index 0000000000000..4e276a72897fe
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -0,0 +1,1588 @@
+//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
+// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
+// CUDA specific intrinsic handlers.
+static constexpr IntrinsicHandler cudaHandlers[]{
+ {"__ldca_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/166461
More information about the flang-commits
mailing list