[Mlir-commits] [clang] [llvm] [mlir] [NVPTX] Convert vector function nvvm.annotations to attributes (PR #127736)
Alex MacLean
llvmlistbot at llvm.org
Wed Feb 19 12:53:09 PST 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/127736
From fd8f342fa2b65f7604955c88e2b73e758dc17134 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 19 Feb 2025 02:26:23 +0000
Subject: [PATCH 1/2] [NVPTX] Convert vector function nvvm.annotations to
attributes
---
clang/lib/CodeGen/Targets/NVPTX.cpp | 8 +-
clang/test/CodeGenCUDA/launch-bounds.cu | 59 +++++-------
clang/test/OpenMP/ompx_attributes_codegen.cpp | 19 ++--
clang/test/OpenMP/thread_limit_nvptx.c | 18 ++--
llvm/docs/NVPTXUsage.rst | 17 ++++
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 51 ++---------
llvm/lib/IR/AutoUpgrade.cpp | 45 ++++++++++
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 42 ++++-----
.../Target/NVPTX/NVPTXCtorDtorLowering.cpp | 29 +-----
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 17 ++--
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 90 +++++++++----------
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 18 ++--
llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 4 +-
.../KernelInfo/launch-bounds/nvptx.ll | 6 +-
llvm/test/CodeGen/NVPTX/annotations.ll | 12 +--
llvm/test/CodeGen/NVPTX/bug26185-2.ll | 6 +-
llvm/test/CodeGen/NVPTX/cluster-dim.ll | 7 +-
llvm/test/CodeGen/NVPTX/intr-range.ll | 15 ++--
llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll | 9 +-
llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 8 +-
.../CodeGen/NVPTX/upgrade-nvvm-annotations.ll | 62 ++++++++++++-
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 41 +++------
mlir/test/Target/LLVMIR/nvvmir.mlir | 27 ++----
23 files changed, 296 insertions(+), 314 deletions(-)
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 98a90613a2d3e..f617e645a9eaf 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
- if (F) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
- NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
- MaxThreads.getExtValue());
- }
+ if (F)
+ F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
}
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 72f7857264f8c..fba66e85040c7 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -10,23 +10,30 @@
#endif
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
-// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
-
-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
-
-// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
-
-// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
+
+// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
+
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
// Test both max threads per block and Min cta per sm.
extern "C" {
@@ -37,8 +44,6 @@ Kernel1()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-
#ifdef USE_MAX_BLOCKS
// Test max threads per block and min/max cta per sm.
extern "C" {
@@ -48,8 +53,6 @@ Kernel1_sm_90()
{
}
}
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
#endif // USE_MAX_BLOCKS
// Test only max threads per block. Min cta per sm defaults to 0, and
@@ -62,8 +65,6 @@ Kernel2()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
-
template <int max_threads_per_block>
__global__ void
__launch_bounds__(max_threads_per_block)
@@ -72,7 +73,6 @@ Kernel3()
}
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
@@ -82,7 +82,6 @@ Kernel4()
}
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -93,7 +92,6 @@ Kernel4_sm_90()
}
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
#endif //USE_MAX_BLOCKS
const int constint = 100;
@@ -106,8 +104,6 @@ Kernel5()
}
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -120,7 +116,6 @@ Kernel5_sm_90()
}
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
#endif //USE_MAX_BLOCKS
// Make sure we don't emit negative launch bounds values.
@@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
Kernel6()
{
}
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
Kernel7()
{
}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
#ifdef USE_MAX_BLOCKS
__global__ void
@@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
Kernel7_sm_90()
{
}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
#endif // USE_MAX_BLOCKS
const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
#ifdef USE_MAX_BLOCKS
const char constchar_2 = 14;
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
#endif // USE_MAX_BLOCKS
diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp
index 6c163c1875171..7cdbe8b9d788a 100644
--- a/clang/test/OpenMP/ompx_attributes_codegen.cpp
+++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp
@@ -15,6 +15,10 @@ void func() {
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
+
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
#pragma omp target teams ompx_attribute(__attribute__((launch_bounds(45, 90))))
@@ -34,9 +38,12 @@ void func() {
// AMD-SAME: "omp_target_thread_limit"="17"
// It is unclear if we should use the AMD annotations for other targets, we do for now.
-// NVIDIA: "omp_target_thread_limit"="20"
-// NVIDIA: "omp_target_thread_limit"="45"
-// NVIDIA: "omp_target_thread_limit"="17"
-// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
-// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
-// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
+// NVIDIA: attributes #[[ATTR0]]
+// NVIDIA-SAME: "omp_target_thread_limit"="20"
+// NVIDIA-SAME: "nvvm.maxntid"="20"
+// NVIDIA: attributes #[[ATTR1]]
+// NVIDIA-SAME: "omp_target_thread_limit"="45"
+// NVIDIA-SAME: "nvvm.maxntid"="45"
+// NVIDIA: attributes #[[ATTR2]]
+// NVIDIA-SAME: "omp_target_thread_limit"="17"
+// NVIDIA-SAME: "nvvm.maxntid"="17"
diff --git a/clang/test/OpenMP/thread_limit_nvptx.c b/clang/test/OpenMP/thread_limit_nvptx.c
index 2132e1aa7834a..ffa6c453067d1 100644
--- a/clang/test/OpenMP/thread_limit_nvptx.c
+++ b/clang/test/OpenMP/thread_limit_nvptx.c
@@ -7,23 +7,21 @@
#define HEADER
void foo(int N) {
-// CHECK: l11, !"maxntidx", i32 128}
+// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N; ++i)
;
-// CHECK: l15, !"maxntidx", i32 4}
+// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
#pragma omp target teams distribute parallel for simd thread_limit(4)
for (int i = 0; i < N; ++i)
;
-// CHECK-NOT: l21, !"maxntidx", i32 128}
-// CHECK: l21, !"maxntidx", i32 42}
-// CHECK-NOT: l21, !"maxntidx", i32 128}
+
+// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
for (int i = 0; i < N; ++i)
;
-// CHECK-NOT: l27, !"maxntidx", i32 42}
-// CHECK: l27, !"maxntidx", i32 22}
-// CHECK-NOT: l27, !"maxntidx", i32 42}
+
+// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
for (int i = 0; i < N; ++i)
;
@@ -31,3 +29,7 @@ void foo(int N) {
#endif
+// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
+// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
+// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
+// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 8550af456e961..f17d7ddd75f19 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -74,6 +74,23 @@ Function Attributes
This attribute indicates the maximum number of registers to be used for the
kernel function.
+``"nvvm.maxntid"="<x>[,<y>[,<z>]]"``
+ This attribute declares the maximum number of threads in the thread block
+ (CTA). The maximum number of threads is the product of the maximum extent in
+ each dimension. Exceeding the maximum number of threads results in a runtime
+ error or kernel launch failure.
+
+``"nvvm.reqntid"="<x>[,<y>[,<z>]]"``
+ This attribute declares the exact number of threads in the thread block
+ (CTA). The number of threads is the product of the value in each dimension.
+ Specifying a different CTA dimension at launch will result in a runtime
+ error or kernel launch failure.
+
+``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"``
+ This attribute declares the number of thread blocks (CTAs) in the cluster.
+ The total number of CTAs is the product of the number of CTAs in each
+ dimension. Specifying a different cluster dimension at launch will result in
+ a runtime error or kernel launch failure. Only supported for Hopper+.
.. _address_spaces:
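
To make the "<x>[,<y>[,<z>]]" encoding described above concrete, here is a minimal sketch (not part of the patch; the function name and dimension values are illustrative) of how a frontend can attach these attributes through the existing llvm::Function::addFnAttr(StringRef, StringRef) API. The two kernels are kept separate because .maxntid and .reqntid should not be combined on the same entry.

    #include "llvm/IR/Function.h"

    // Sketch: attach the launch-bound attributes documented above.
    // Unspecified trailing dimensions default to 1, so "16,16" means 16x16x1.
    static void setExampleLaunchBounds(llvm::Function &Kernel1,
                                       llvm::Function &Kernel2) {
      Kernel1.addFnAttr("nvvm.maxntid", "16,16");     // at most 16*16 = 256 threads per CTA
      Kernel1.addFnAttr("nvvm.cluster_dim", "2,2,1"); // 4 CTAs per cluster (sm_90+ only)
      Kernel2.addFnAttr("nvvm.reqntid", "32");        // exactly 32x1x1 threads per CTA
    }

These strings are what the NVPTXAsmPrinter changes below turn into the .maxntid, .reqntid, and .reqnctapercluster directives.
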
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 18bc82fc827f7..a56e66fbab0a1 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6366,45 +6366,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
KernelEnvironmentGV->setInitializer(NewInitializer);
}
-static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() != 3)
- continue;
- auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
- if (!KernelOp || KernelOp->getValue() != &Kernel)
- continue;
- auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
- if (!Prop || Prop->getString() != Name)
- continue;
- return Op;
- }
- return nullptr;
-}
-
-static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
- bool Min) {
- // Update the "maxntidx" metadata for NVIDIA, or add it.
- MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
- if (ExistingOp) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
- ExistingOp->replaceOperandWith(
- 2, ConstantAsMetadata::get(ConstantInt::get(
- OldVal->getValue()->getType(),
- Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
- } else {
- LLVMContext &Ctx = Kernel.getContext();
- Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
- MDString::get(Ctx, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
- // Append metadata to nvvm.annotations
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- MD->addOperand(MDNode::get(Ctx, MDVals));
+static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
+ bool Min) {
+ if (Kernel.hasFnAttribute(Name)) {
+ int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
+ Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
}
+ Kernel.addFnAttr(Name, llvm::utostr(Value));
}
std::pair<int32_t, int32_t>
@@ -6426,9 +6394,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
return {LB, UB};
}
- if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
+ if (Kernel.hasFnAttribute("nvvm.maxntid")) {
+ int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
}
return {0, ThreadLimit};
@@ -6445,7 +6412,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
return;
}
- updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
+ updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
}
std::pair<int32_t, int32_t>
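
For reference, the merge rule that updateNVPTXAttr implements above, shown as a standalone sketch (standard C++ only; the function name is illustrative): an existing bound is combined with the incoming value, taking the minimum for upper bounds such as "nvvm.maxntid".

    #include <algorithm>
    #include <cstdint>
    #include <optional>

    // Sketch of the clamp performed before the attribute string is rewritten.
    static int32_t mergeBound(std::optional<int32_t> Existing, int32_t Incoming,
                              bool Min) {
      if (!Existing)
        return Incoming;
      return Min ? std::min(*Existing, Incoming) : std::max(*Existing, Incoming);
    }
    // mergeBound(128, 42, /*Min=*/true) == 42: a tighter thread limit wins.
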
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 57072715366c9..dc18ba9780fec 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -13,11 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -46,6 +48,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
+#include <cstdint>
#include <cstring>
#include <numeric>
@@ -5021,6 +5024,36 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return Modified;
}
+static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
+ GlobalValue *GV, const Metadata *V) {
+ Function *F = cast<Function>(GV);
+
+ constexpr StringLiteral DefaultValue = "1";
+ StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
+ unsigned Length = 0;
+
+ if (F->hasFnAttribute(Attr)) {
+ StringRef S = F->getFnAttribute(Attr).getValueAsString();
+ for (; Length < 3 && !S.empty(); Length++) {
+ auto [Part, Rest] = S.split(',');
+ Vect3[Length] = Part.trim();
+ S = Rest;
+ }
+ }
+
+ const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
+ const std::string VStr = llvm::utostr(VInt);
+
+ const unsigned Dim = DimC - 'x';
+ assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
+
+ Vect3[Dim] = VStr;
+ Length = std::max(Length, Dim + 1);
+
+ const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
+ F->addFnAttr(Attr, NewAttr);
+}
+
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
const Metadata *V) {
if (K == "kernel") {
@@ -5059,6 +5092,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
return true;
}
+ if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
+ return true;
+ }
return false;
}
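
The upgrade of the per-dimension annotations is the least obvious part of this patch, so here is a standalone sketch (standard C++; names are illustrative) of how upgradeNVVMFnVectorAttr folds successive "maxntidx"/"maxntidy"/"maxntidz" entries into one comma-separated string: missing leading dimensions default to "1", and the string only grows as far as the highest dimension seen so far.

    #include <array>
    #include <cstddef>
    #include <string>

    // Sketch: fold one dimension ('x', 'y', or 'z') into an accumulated
    // "x[,y[,z]]" string, mirroring the logic of upgradeNVVMFnVectorAttr.
    static std::string foldDim(const std::string &Current, char DimC,
                               const std::string &Val) {
      std::array<std::string, 3> V = {"1", "1", "1"};
      unsigned Len = 0;
      // Re-parse whatever has been accumulated so far.
      for (std::size_t Pos = 0; Len < 3 && !Current.empty();) {
        std::size_t Comma = Current.find(',', Pos);
        V[Len++] = Current.substr(Pos, Comma - Pos);
        if (Comma == std::string::npos)
          break;
        Pos = Comma + 1;
      }
      const unsigned Dim = DimC - 'x';
      V[Dim] = Val;
      Len = std::max(Len, Dim + 1);
      std::string Out = V[0];
      for (unsigned I = 1; I < Len; ++I)
        Out += "," + V[I];
      return Out;
    }
    // foldDim("", 'x', "11") == "11"; foldDim("11", 'y', "22") == "11,22";
    // foldDim("", 'z', "100") == "1,1,100".
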
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index c8e29c1da6ec4..f98d9609ff135 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -35,6 +35,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -506,24 +507,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
- std::optional<unsigned> Reqntidx = getReqNTIDx(F);
- std::optional<unsigned> Reqntidy = getReqNTIDy(F);
- std::optional<unsigned> Reqntidz = getReqNTIDz(F);
+ const auto ReqNTID = getReqNTID(F);
+ if (!ReqNTID.empty())
+ O << formatv(".reqntid {0:$[, ]}\n",
+ make_range(ReqNTID.begin(), ReqNTID.end()));
- if (Reqntidx || Reqntidy || Reqntidz)
- O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
- << ", " << Reqntidz.value_or(1) << "\n";
-
- // If the NVVM IR has some of maxntid* specified, then output
- // the maxntid directive, and set the unspecified ones to 1.
- // If none of maxntid* is specified, don't output maxntid directive.
- std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
- std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
- std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
-
- if (Maxntidx || Maxntidy || Maxntidz)
- O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
- << ", " << Maxntidz.value_or(1) << "\n";
+ const auto MaxNTID = getMaxNTID(F);
+ if (!MaxNTID.empty())
+ O << formatv(".maxntid {0:$[, ]}\n",
+ make_range(MaxNTID.begin(), MaxNTID.end()));
if (const auto Mincta = getMinCTASm(F))
O << ".minnctapersm " << *Mincta << "\n";
@@ -537,21 +529,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
if (STI->getSmVersion() >= 90) {
- std::optional<unsigned> ClusterX = getClusterDimx(F);
- std::optional<unsigned> ClusterY = getClusterDimy(F);
- std::optional<unsigned> ClusterZ = getClusterDimz(F);
+ const auto ClusterDim = getClusterDim(F);
- if (ClusterX || ClusterY || ClusterZ) {
+ if (!ClusterDim.empty()) {
O << ".explicitcluster\n";
- if (ClusterX.value_or(1) != 0) {
- assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
+ if (ClusterDim[0] != 0) {
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
"should be non-zero as well");
- O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
- << ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
+ O << formatv(".reqnctapercluster {0:$[, ]}\n",
+ make_range(ClusterDim.begin(), ClusterDim.end()));
} else {
- assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
"cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
"should be 0 as well");
}
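
The formatv range style used above is compact but easy to misread, so here is an isolated sketch (function name and values are illustrative, the call itself is lifted from the patch) of how "{0:$[, ]}" joins the parsed dimensions into the PTX directive text:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/iterator_range.h"
    #include "llvm/Support/FormatVariadic.h"
    #include "llvm/Support/raw_ostream.h"

    // Sketch: {10, 20, 30} prints as ".maxntid 10, 20, 30"; an empty vector
    // prints nothing, matching the "don't emit the directive" behavior.
    static void printMaxNTIDDirective(llvm::raw_ostream &O,
                                      const llvm::SmallVector<unsigned, 3> &MaxNTID) {
      if (!MaxNTID.empty())
        O << llvm::formatv(".maxntid {0:$[, ]}\n",
                           llvm::make_range(MaxNTID.begin(), MaxNTID.end()));
    }
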
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index ae5922cba4ce3..b10e0b14118a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
}
-static void addKernelMetadata(Module &M, Function *F) {
- llvm::LLVMContext &Ctx = M.getContext();
-
- // Get "nvvm.annotations" metadata node.
- llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
- // This kernel is only to be called single-threaded.
- llvm::Metadata *ThreadXMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadYMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadZMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
+static void addKernelAttrs(Function *F) {
F->addFnAttr("nvvm.maxclusterrank", "1");
+ F->addFnAttr("nvvm.maxntid", "1");
F->setCallingConv(CallingConv::PTX_Kernel);
-
- // Append metadata to nvvm.annotations.
- MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
- MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
- MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
}
static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
@@ -88,7 +65,7 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M);
- addKernelMetadata(M, InitOrFiniKernel);
+ addKernelAttrs(InitOrFiniKernel);
return InitOrFiniKernel;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 85e99d7fe97a2..1d0aa8981d58a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -567,13 +567,14 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
void NVPTXTTIImpl::collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
- std::optional<unsigned> Val;
- if ((Val = getMaxClusterRank(F)))
+ if (const auto Val = getMaxClusterRank(F))
LB.push_back({"maxclusterrank", *Val});
- if ((Val = getMaxNTIDx(F)))
- LB.push_back({"maxntidx", *Val});
- if ((Val = getMaxNTIDy(F)))
- LB.push_back({"maxntidy", *Val});
- if ((Val = getMaxNTIDz(F)))
- LB.push_back({"maxntidz", *Val});
+
+ const auto MaxNTID = getMaxNTID(F);
+ if (MaxNTID.size() > 0)
+ LB.push_back({"maxntidx", MaxNTID[0]});
+ if (MaxNTID.size() > 1)
+ LB.push_back({"maxntidy", MaxNTID[1]});
+ if (MaxNTID.size() > 2)
+ LB.push_back({"maxntidz", MaxNTID[2]});
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index cdb0f559d78b4..0a707bf06095d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -13,6 +13,8 @@
#include "NVPTXUtilities.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -196,6 +198,36 @@ static std::optional<unsigned> getFnAttrParsedInt(const Function &F,
: std::nullopt;
}
+static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
+ StringRef Attr) {
+ SmallVector<unsigned, 3> V;
+ auto &Ctx = F.getContext();
+
+ if (F.hasFnAttribute(Attr)) {
+ StringRef S = F.getFnAttribute(Attr).getValueAsString();
+ for (unsigned I = 0; I < 3 && !S.empty(); I++) {
+ auto [First, Rest] = S.split(",");
+ unsigned IntVal;
+ if (First.trim().getAsInteger(0, IntVal))
+ Ctx.emitError("can't parse integer attribute " + First + " in " + Attr);
+
+ V.push_back(IntVal);
+ S = Rest;
+ }
+ }
+ return V;
+}
+
+static std::optional<unsigned> getVectorProduct(ArrayRef<unsigned> V) {
+ if (V.empty())
+ return std::nullopt;
+
+ unsigned Product = 1;
+ for (const unsigned E : V)
+ Product *= E;
+ return Product;
+}
+
bool isParamGridConstant(const Value &V) {
if (const Argument *Arg = dyn_cast<Argument>(&V)) {
// "grid_constant" counts argument indices starting from 1
@@ -254,71 +286,39 @@ StringRef getSamplerName(const Value &V) {
return V.getName();
}
-std::optional<unsigned> getMaxNTIDx(const Function &F) {
- return findOneNVVMAnnotation(&F, "maxntidx");
+SmallVector<unsigned, 3> getMaxNTID(const Function &F) {
+ return getFnAttrParsedVector(F, "nvvm.maxntid");
}
-std::optional<unsigned> getMaxNTIDy(const Function &F) {
- return findOneNVVMAnnotation(&F, "maxntidy");
+SmallVector<unsigned, 3> getReqNTID(const Function &F) {
+ return getFnAttrParsedVector(F, "nvvm.reqntid");
}
-std::optional<unsigned> getMaxNTIDz(const Function &F) {
- return findOneNVVMAnnotation(&F, "maxntidz");
+SmallVector<unsigned, 3> getClusterDim(const Function &F) {
+ return getFnAttrParsedVector(F, "nvvm.cluster_dim");
}
-std::optional<unsigned> getMaxNTID(const Function &F) {
+std::optional<unsigned> getOverallMaxNTID(const Function &F) {
// Note: The semantics here are a bit strange. The PTX ISA states the
// following (11.4.2. Performance-Tuning Directives: .maxntid):
//
// Note that this directive guarantees that the total number of threads does
// not exceed the maximum, but does not guarantee that the limit in any
// particular dimension is not exceeded.
- std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
- std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
- std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
- if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
- return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
- return std::nullopt;
-}
-
-std::optional<unsigned> getClusterDimx(const Function &F) {
- return findOneNVVMAnnotation(&F, "cluster_dim_x");
+ const auto MaxNTID = getMaxNTID(F);
+ return getVectorProduct(MaxNTID);
}
-std::optional<unsigned> getClusterDimy(const Function &F) {
- return findOneNVVMAnnotation(&F, "cluster_dim_y");
-}
-
-std::optional<unsigned> getClusterDimz(const Function &F) {
- return findOneNVVMAnnotation(&F, "cluster_dim_z");
+std::optional<unsigned> getOverallReqNTID(const Function &F) {
+ // Note: The semantics here are a bit strange. See getMaxNTID.
+ const auto ReqNTID = getReqNTID(F);
+ return getVectorProduct(ReqNTID);
}
std::optional<unsigned> getMaxClusterRank(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
}
-std::optional<unsigned> getReqNTIDx(const Function &F) {
- return findOneNVVMAnnotation(&F, "reqntidx");
-}
-
-std::optional<unsigned> getReqNTIDy(const Function &F) {
- return findOneNVVMAnnotation(&F, "reqntidy");
-}
-
-std::optional<unsigned> getReqNTIDz(const Function &F) {
- return findOneNVVMAnnotation(&F, "reqntidz");
-}
-
-std::optional<unsigned> getReqNTID(const Function &F) {
- // Note: The semantics here are a bit strange. See getMaxNTID.
- std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
- std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
- std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
- if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
- return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
- return std::nullopt;
-}
-
std::optional<unsigned> getMinCTASm(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.minctasm");
}
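
To summarize how the parsed vector is consumed, here is a standalone sketch (standard C++ only; names are illustrative) of the round trip from the attribute string back to dimensions and to the overall thread bound computed by getOverallMaxNTID/getOverallReqNTID. Error handling is omitted; the real helpers report parse failures through LLVMContext::emitError.

    #include <cstdint>
    #include <optional>
    #include <sstream>
    #include <string>
    #include <vector>

    // Sketch: parse an "x[,y[,z]]" attribute value into up to three dimensions.
    static std::vector<unsigned> parseDims(const std::string &S) {
      std::vector<unsigned> Dims;
      std::stringstream SS(S);
      std::string Part;
      while (Dims.size() < 3 && std::getline(SS, Part, ','))
        Dims.push_back(static_cast<unsigned>(std::stoul(Part)));
      return Dims;
    }

    // Sketch: the overall bound is the product of the specified dimensions,
    // or nullopt when the attribute is absent.
    static std::optional<uint64_t> overallBound(const std::vector<unsigned> &Dims) {
      if (Dims.empty())
        return std::nullopt;
      uint64_t Product = 1;
      for (unsigned D : Dims)
        Product *= D;
      return Product;
    }
    // parseDims("16,16,2") == {16, 16, 2}; overallBound({16, 16, 2}) == 512.
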
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index cf35eaf4cbae5..cecd003bf23a4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
#include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@@ -47,19 +48,12 @@ StringRef getTextureName(const Value &);
StringRef getSurfaceName(const Value &);
StringRef getSamplerName(const Value &);
-std::optional<unsigned> getMaxNTIDx(const Function &);
-std::optional<unsigned> getMaxNTIDy(const Function &);
-std::optional<unsigned> getMaxNTIDz(const Function &);
-std::optional<unsigned> getMaxNTID(const Function &);
+SmallVector<unsigned, 3> getMaxNTID(const Function &);
+SmallVector<unsigned, 3> getReqNTID(const Function &);
+SmallVector<unsigned, 3> getClusterDim(const Function &);
-std::optional<unsigned> getReqNTIDx(const Function &);
-std::optional<unsigned> getReqNTIDy(const Function &);
-std::optional<unsigned> getReqNTIDz(const Function &);
-std::optional<unsigned> getReqNTID(const Function &);
-
-std::optional<unsigned> getClusterDimx(const Function &);
-std::optional<unsigned> getClusterDimy(const Function &);
-std::optional<unsigned> getClusterDimz(const Function &);
+std::optional<unsigned> getOverallMaxNTID(const Function &);
+std::optional<unsigned> getOverallReqNTID(const Function &);
std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);
diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 8dd46f9a1402d..8286e9661f202 100644
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -67,8 +67,8 @@ static bool runNVVMIntrRange(Function &F) {
unsigned x, y, z;
} MaxBlockSize, MaxGridSize;
- const unsigned MetadataNTID = getReqNTID(F).value_or(
- getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+ const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
+ getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
MaxBlockSize.x = std::min(1024u, MetadataNTID);
MaxBlockSize.y = std::min(1024u, MetadataNTID);
diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
index a0c06083c270b..f5cabb4e6488d 100644
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@@ -24,11 +24,11 @@ attributes #0 = {
"omp_target_num_teams"="100"
"omp_target_thread_limit"="101"
"nvvm.maxclusterrank"="200"
+ "nvvm.maxntid"="210,211,212"
}
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!7, !8, !9, !10}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@@ -36,7 +36,3 @@ attributes #0 = {
!3 = !{}
!4 = !DISubroutineType(types: !3)
!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!7 = !{ptr @test, !"maxntidx", i32 210}
-!8 = !{ptr @test, !"maxntidy", i32 211}
-!9 = !{ptr @test, !"maxntidz", i32 212}
-!10 = distinct !{ptr null, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll
index 1f888d7fb21f1..5360e8988777b 100644
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@@ -9,14 +9,14 @@
; CHECK: .global .surfref surface
; CHECK: .entry kernel_func_maxntid
-define void @kernel_func_maxntid(ptr %a) {
+define ptx_kernel void @kernel_func_maxntid(ptr %a) "nvvm.maxntid"="10,20,30" {
; CHECK: .maxntid 10, 20, 30
; CHECK: ret
ret void
}
; CHECK: .entry kernel_func_reqntid
-define void @kernel_func_reqntid(ptr %a) {
+define ptx_kernel void @kernel_func_reqntid(ptr %a) "nvvm.reqntid"="11,22,33" {
; CHECK: .reqntid 11, 22, 33
; CHECK: ret
ret void
@@ -36,13 +36,7 @@ define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
ret void
}
-!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
-
-!1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
-!2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
-
-!3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
-!4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
+!nvvm.annotations = !{!9, !10}
!9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
!10 = !{ptr addrspace(1) @surface, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index 89cafcede06bd..c0bbf5b3559bb 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -26,8 +26,4 @@ bb:
ret void
}
-attributes #0 = { norecurse nounwind "polly.skip.fn" }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
+attributes #0 = { norecurse nounwind "polly.skip.fn" "nvvm.maxntid"="1,1,1" }
diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
index 9275c895b224a..196b967ce8685 100644
--- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll
+++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
-define ptx_kernel void @kernel_func_clusterxyz() {
+define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" {
; CHECK80-LABEL: kernel_func_clusterxyz(
; CHECK80: {
; CHECK80-EMPTY:
@@ -21,8 +21,3 @@ define ptx_kernel void @kernel_func_clusterxyz() {
; CHECK90-NEXT: ret;
ret void
}
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 86776ab09efc6..884a4b1a3584f 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
-define ptx_kernel i32 @test_maxntid() {
+define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -31,9 +31,9 @@ define ptx_kernel i32 @test_maxntid() {
ret i32 %11
}
-define ptx_kernel i32 @test_reqntid() {
+define ptx_kernel i32 @test_reqntid() "nvvm.reqntid"="20" {
; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@@ -64,9 +64,9 @@ define ptx_kernel i32 @test_reqntid() {
;; A case like this could occur if a function with the sreg intrinsic was
;; inlined into a kernel where the tid metadata is present, ensure the range is
;; updated.
-define ptx_kernel i32 @test_inlined() {
+define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: ret i32 [[TMP1]]
;
@@ -81,8 +81,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-
-!nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
-!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
-!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index 3b73c36de4b89..02118fbf741bf 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -84,11 +84,4 @@ define internal void @bar() {
; CHECK: while.end:
; CHECK-NEXT: ret void
-; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
-
-; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
-; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
-; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
+; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" "nvvm.maxntid"="1" }
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index 51483296dd34f..ce8181e9a70e2 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -4,20 +4,16 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
-; CHECK: .maxntid 128, 1, 1
+; CHECK: .maxntid 128
; CHECK: .minnctapersm 2
; CHECK_SM_90: .maxclusterrank 8
; CHECK_SM_80-NOT: .maxclusterrank 8
; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
; silently ignored.
-define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" "nvvm.maxntid"="128" {
entry:
%a = alloca i32, align 4
store volatile i32 1, ptr %a, align 4
ret void
}
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
index 3a1f59454493c..7c5d3d37d5508 100644
--- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@@ -48,7 +48,55 @@ define void @test_maxnreg() {
ret void
}
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
+define void @test_maxntid_1() {
+; CHECK-LABEL: define void @test_maxntid_1(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test_maxntid_2() {
+; CHECK-LABEL: define void @test_maxntid_2(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test_maxntid_3() {
+; CHECK-LABEL: define void @test_maxntid_3(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test_maxntid_4() {
+; CHECK-LABEL: define void @test_maxntid_4(
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test_reqntid() {
+; CHECK-LABEL: define void @test_reqntid(
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test_cluster_dim() {
+; CHECK-LABEL: define void @test_cluster_dim(
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
!1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
@@ -57,12 +105,24 @@ define void @test_maxnreg() {
!4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3}
!5 = !{ptr @test_minctasm, !"minctasm", i32 4}
!6 = !{ptr @test_maxnreg, !"maxnreg", i32 5}
+!7 = !{ptr @test_maxntid_1, !"maxntidx", i32 50}
+!8 = !{ptr @test_maxntid_2, !"maxntidx", i32 11, !"maxntidy", i32 22, !"maxntidz", i32 33}
+!9 = !{ptr @test_maxntid_3, !"maxntidz", i32 11, !"maxntidy", i32 22, !"maxntidx", i32 33}
+!10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100}
+!11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33}
+!12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103}
;.
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" }
; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" }
; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" }
; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" }
+; CHECK: attributes #[[ATTR4]] = { "nvvm.maxntid"="50" }
+; CHECK: attributes #[[ATTR5]] = { "nvvm.maxntid"="11,22,33" }
+; CHECK: attributes #[[ATTR6]] = { "nvvm.maxntid"="33,22,11" }
+; CHECK: attributes #[[ATTR7]] = { "nvvm.maxntid"="1,1,100" }
+; CHECK: attributes #[[ATTR8]] = { "nvvm.reqntid"="31,32,33" }
+; CHECK: attributes #[[ATTR9]] = { "nvvm.cluster_dim"="101,102,103" }
;.
; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8}
;.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 8b13735774663..858beed959933 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -18,8 +18,10 @@
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace mlir;
using namespace mlir::LLVM;
@@ -183,48 +185,33 @@ class NVVMDialectLLVMIRTranslationInterface
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
- llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName());
- auto generateMetadata = [&](int dim, StringRef name) {
- llvm::Metadata *llvmMetadata[] = {
- llvm::ValueAsMetadata::get(llvmFunc),
- llvm::MDString::get(llvmContext, name),
- llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
- llvm::Type::getInt32Ty(llvmContext), dim))};
- llvm::MDNode *llvmMetadataNode =
- llvm::MDNode::get(llvmContext, llvmMetadata);
- moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
- ->addOperand(llvmMetadataNode);
- };
if (attribute.getName() == NVVM::NVVMDialect::getMaxntidAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
- generateMetadata(values[0], NVVM::NVVMDialect::getMaxntidXName());
- if (values.size() > 1)
- generateMetadata(values[1], NVVM::NVVMDialect::getMaxntidYName());
- if (values.size() > 2)
- generateMetadata(values[2], NVVM::NVVMDialect::getMaxntidZName());
+ const std::string attr = llvm::formatv(
+ "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+ values.asArrayRef().end()));
+ llvmFunc->addFnAttr("nvvm.maxntid", attr);
} else if (attribute.getName() == NVVM::NVVMDialect::getReqntidAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
- generateMetadata(values[0], NVVM::NVVMDialect::getReqntidXName());
- if (values.size() > 1)
- generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName());
- if (values.size() > 2)
- generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName());
+ const std::string attr = llvm::formatv(
+ "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+ values.asArrayRef().end()));
+ llvmFunc->addFnAttr("nvvm.reqntid", attr);
} else if (attribute.getName() ==
NVVM::NVVMDialect::getClusterDimAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
- generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName());
- if (values.size() > 1)
- generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName());
- if (values.size() > 2)
- generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName());
+ const std::string attr = llvm::formatv(
+ "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+ values.asArrayRef().end()));
+ llvmFunc->addFnAttr("nvvm.cluster_dim", attr);
} else if (attribute.getName() ==
NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) {
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 5ab593452ab66..a9717d26d7854 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -590,33 +590,24 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
llvm.return
}
-// CHECK: define ptx_kernel void @kernel_func
-// CHECK: !nvvm.annotations =
-// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
-// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
-// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.maxntid"="1,23,32" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} {
llvm.return
}
-// CHECK: define ptx_kernel void @kernel_func
-// CHECK: !nvvm.annotations =
-// CHECK: {ptr @kernel_func, !"reqntidx", i32 1}
-// CHECK: {ptr @kernel_func, !"reqntidy", i32 23}
-// CHECK: {ptr @kernel_func, !"reqntidz", i32 32}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.reqntid"="1,23,32" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
llvm.return
}
-// CHECK: define ptx_kernel void @kernel_func
-// CHECK: !nvvm.annotations =
-// CHECK: {ptr @kernel_func, !"cluster_dim_x", i32 3}
-// CHECK: {ptr @kernel_func, !"cluster_dim_y", i32 5}
-// CHECK: {ptr @kernel_func, !"cluster_dim_z", i32 7}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.cluster_dim"="3,5,7" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
@@ -650,11 +641,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
}
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
-// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.minctasm"="16" }
-// CHECK: !nvvm.annotations =
-// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
-// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
-// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
+// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.maxntid"="1,23,32" "nvvm.minctasm"="16" }
// -----
// CHECK: define ptx_kernel void @kernel_func
From 0a227b1a9eddd20b0a679f980faf61866c05fbe2 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 19 Feb 2025 20:52:54 +0000
Subject: [PATCH 2/2] address comments, fixup test
---
clang/test/OpenMP/ompx_attributes_codegen.cpp | 18 +++++++++---------
llvm/lib/IR/AutoUpgrade.cpp | 17 +++++++++++------
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 14 +++++++-------
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 4 ++--
4 files changed, 29 insertions(+), 24 deletions(-)
diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp
index 7cdbe8b9d788a..d68f00a81335c 100644
--- a/clang/test/OpenMP/ompx_attributes_codegen.cpp
+++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp
@@ -11,13 +11,13 @@
// Check that the target attributes are set on the generated kernel
void func() {
- // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
- // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
- // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
+ // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
+ // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
+ // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
- // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
- // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
- // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
+ // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
@@ -39,11 +39,11 @@ void func() {
// It is unclear if we should use the AMD annotations for other targets, we do for now.
// NVIDIA: attributes #[[ATTR0]]
-// NVIDIA-SAME: "omp_target_thread_limit"="20"
// NVIDIA-SAME: "nvvm.maxntid"="20"
+// NVIDIA-SAME: "omp_target_thread_limit"="20"
// NVIDIA: attributes #[[ATTR1]]
-// NVIDIA-SAME: "omp_target_thread_limit"="45"
// NVIDIA-SAME: "nvvm.maxntid"="45"
+// NVIDIA-SAME: "omp_target_thread_limit"="45"
// NVIDIA: attributes #[[ATTR2]]
-// NVIDIA-SAME: "omp_target_thread_limit"="17"
// NVIDIA-SAME: "nvvm.maxntid"="17"
+// NVIDIA-SAME: "omp_target_thread_limit"="17"
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index dc18ba9780fec..1daf15b481cf8 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5033,6 +5033,8 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
unsigned Length = 0;
if (F->hasFnAttribute(Attr)) {
+ // We expect the existing attribute to have the form "x[,y[,z]]". Here we
+ // parse these elements placing them into Vect3
StringRef S = F->getFnAttribute(Attr).getValueAsString();
for (; Length < 3 && !S.empty(); Length++) {
auto [Part, Rest] = S.split(',');
@@ -5041,12 +5043,11 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
}
}
- const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
- const std::string VStr = llvm::utostr(VInt);
-
const unsigned Dim = DimC - 'x';
assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
+ const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
+ const std::string VStr = llvm::utostr(VInt);
Vect3[Dim] = VStr;
Length = std::max(Length, Dim + 1);
@@ -5054,6 +5055,10 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
F->addFnAttr(Attr, NewAttr);
}
+static inline bool isXYZ(StringRef S) {
+ return S == "x" || S == "y" || S == "z";
+}
+
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
const Metadata *V) {
if (K == "kernel") {
@@ -5092,15 +5097,15 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
return true;
}
- if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) {
+ if (K.consume_front("maxntid") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
return true;
}
- if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) {
+ if (K.consume_front("reqntid") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
return true;
}
- if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) {
+ if (K.consume_front("cluster_dim_") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
return true;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 0a707bf06095d..d44a38cbde72a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Mutex.h"
+#include <cstdint>
#include <cstring>
#include <map>
#include <mutex>
@@ -204,6 +205,8 @@ static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
auto &Ctx = F.getContext();
if (F.hasFnAttribute(Attr)) {
+ // We expect the attribute value to be of the form "x[,y[,z]]", where x, y,
+ // and z are unsigned values.
StringRef S = F.getFnAttribute(Attr).getValueAsString();
for (unsigned I = 0; I < 3 && !S.empty(); I++) {
auto [First, Rest] = S.split(",");
@@ -218,14 +221,11 @@ static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
return V;
}
-static std::optional<unsigned> getVectorProduct(ArrayRef<unsigned> V) {
+static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) {
if (V.empty())
return std::nullopt;
- unsigned Product = 1;
- for (const unsigned E : V)
- Product *= E;
- return Product;
+ return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
}
bool isParamGridConstant(const Value &V) {
@@ -298,7 +298,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &F) {
return getFnAttrParsedVector(F, "nvvm.cluster_dim");
}
-std::optional<unsigned> getOverallMaxNTID(const Function &F) {
+std::optional<uint64_t> getOverallMaxNTID(const Function &F) {
// Note: The semantics here are a bit strange. The PTX ISA states the
// following (11.4.2. Performance-Tuning Directives: .maxntid):
//
@@ -309,7 +309,7 @@ std::optional<unsigned> getOverallMaxNTID(const Function &F) {
return getVectorProduct(MaxNTID);
}
-std::optional<unsigned> getOverallReqNTID(const Function &F) {
+std::optional<uint64_t> getOverallReqNTID(const Function &F) {
// Note: The semantics here are a bit strange. See getMaxNTID.
const auto ReqNTID = getReqNTID(F);
return getVectorProduct(ReqNTID);
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index cecd003bf23a4..4ed379765fc20 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -52,8 +52,8 @@ SmallVector<unsigned, 3> getMaxNTID(const Function &);
SmallVector<unsigned, 3> getReqNTID(const Function &);
SmallVector<unsigned, 3> getClusterDim(const Function &);
-std::optional<unsigned> getOverallMaxNTID(const Function &);
-std::optional<unsigned> getOverallReqNTID(const Function &);
+std::optional<uint64_t> getOverallMaxNTID(const Function &);
+std::optional<uint64_t> getOverallReqNTID(const Function &);
std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);