[llvm] [NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (PR #138338)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Mon May 5 14:17:09 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/138338
>From f542b21d2647ee97884c6f04c708c0bd9d9951c4 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 2 May 2025 19:37:34 +0000
Subject: [PATCH 1/2] [NVPTX] Add ranges to intrinsic definitions, cleanup
NVVMIntrRange
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 95 +++++++++++----
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 10 ++
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 1 +
llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 126 ++++++++++----------
llvm/test/CodeGen/NVPTX/intr-range.ll | 140 +++++++++++++++++++++++
llvm/test/CodeGen/NVPTX/intrinsic-old.ll | 19 +--
6 files changed, 291 insertions(+), 100 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2c550de7a9203..6b385a30c804d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -139,6 +139,16 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
// MISC
//
+defvar WARP_SIZE = 32;
+
+defvar MAX_GRID_SIZE_X = 0x7fffffff;
+defvar MAX_GRID_SIZE_Y = 0xffff;
+defvar MAX_GRID_SIZE_Z = 0xffff;
+
+defvar MAX_BLOCK_SIZE_X = 1024;
+defvar MAX_BLOCK_SIZE_Y = 1024;
+defvar MAX_BLOCK_SIZE_Z = 64;
+
// Helper class that concatenates list elements with
// a given separator 'sep' and returns the result.
// Handles empty strings.
@@ -4747,26 +4757,33 @@ def int_nvvm_sust_p_3d_v4i32_trap
// Accessing special registers.
-class PTXReadSRegIntrinsicNB_r32
- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
-class PTXReadSRegIntrinsic_r32<string name>
- : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
+ !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
-multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
+class PTXReadSRegIntrinsic_r32<string name,
+ list<IntrinsicProperty> properties = []>
+ : PTXReadSRegIntrinsicNB_r32<properties>,
+ ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+
+multiclass PTXReadSRegIntrinsic_v4i32<string regname,
+ list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
// FIXME: Do we need the 128-bit integer type version?
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
// FIXME: Enable this once v4i32 support is enabled in back-end.
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
- foreach suffix = ["_x", "_y", "_z", "_w"] in
- def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
+ foreach i = !range(suffixes) in
+ def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
}
// Same, but without automatic clang builtins. It will be used for
// registers that require particular GPU or PTX version.
-multiclass PTXReadSRegIntrinsicNB_v4i32 {
- foreach suffix = ["_x", "_y", "_z", "_w"] in
- def suffix : PTXReadSRegIntrinsicNB_r32;
+multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
+ foreach i = !range(suffixes) in
+ def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
}
class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4799,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
-defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
-defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
+defm int_nvvm_read_ptx_sreg_tid
+ : PTXReadSRegIntrinsic_v4i32<"tid",
+ [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
+ [Range<RetIndex, 0, 1>]]>;
+
+defm int_nvvm_read_ptx_sreg_ntid
+ : PTXReadSRegIntrinsic_v4i32<"ntid",
+ [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
+ [Range<RetIndex, 0, 1>]]>;
+
+def int_nvvm_read_ptx_sreg_laneid
+ : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
-def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
-defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
-defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
+defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
+ [Range<RetIndex, 0, 1>]];
+
+defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
+ [Range<RetIndex, 0, 1>]];
+
+defm int_nvvm_read_ptx_sreg_ctaid
+ : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
+
+defm int_nvvm_read_ptx_sreg_nctaid
+ : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4860,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
-def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
+def int_nvvm_read_ptx_sreg_warpsize
+ : PTXReadSRegIntrinsic_r32<"warpsize",
+ [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
// sm90+, PTX7.8+
-defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
+
+// Note: Since clusters are subdivisions of the grid, we conservatively use the
+// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
+// practice, the cluster_ctaid will be much smaller: the CUDA programming guide
+// recommends 8 as the maximum portable cluster size and H100s support 16.
+
+defm int_nvvm_read_ptx_sreg_clusterid
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_nclusterid
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_ctaid
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_nctaid
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 61b50b69b4e86..6586f925504f1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
return getVectorProduct(ReqNTID);
}
+std::optional<uint64_t> getOverallClusterRank(const Function &F) {
+ // maxclusterrank and cluster_dim are mutually exclusive.
+ if (const auto ClusterRank = getMaxClusterRank(F))
+ return ClusterRank;
+
+ // Note: The semantics here are a bit strange. See getMaxNTID.
+ const auto ClusterDim = getClusterDim(F);
+ return getVectorProduct(ClusterDim);
+}
+
std::optional<unsigned> getMaxClusterRank(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index a1b4a0e5e7471..e792e441e49e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);
std::optional<uint64_t> getOverallMaxNTID(const Function &);
std::optional<uint64_t> getOverallReqNTID(const Function &);
+std::optional<uint64_t> getOverallClusterRank(const Function &);
std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);
diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 91b8e470e055e..2c81989932a97 100644
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
}
static bool runNVVMIntrRange(Function &F) {
- struct {
- unsigned x, y, z;
- } MaxBlockSize, MaxGridSize;
+ struct Vector3 {
+ unsigned X, Y, Z;
+ };
- const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
- getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+ // All these annotations are only valid for kernel functions.
+ if (!isKernelFunction(F))
+ return false;
- MaxBlockSize.x = std::min(1024u, MetadataNTID);
- MaxBlockSize.y = std::min(1024u, MetadataNTID);
- MaxBlockSize.z = std::min(64u, MetadataNTID);
+ const auto OverallReqNTID = getOverallReqNTID(F);
+ const auto OverallMaxNTID = getOverallMaxNTID(F);
+ const auto OverallClusterRank = getOverallClusterRank(F);
- MaxGridSize.x = 0x7fffffff;
- MaxGridSize.y = 0xffff;
- MaxGridSize.z = 0xffff;
+ // If this function lacks any range information, do nothing.
+ if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
+ return false;
- // Go through the calls in this function.
- bool Changed = false;
- for (Instruction &I : instructions(F)) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (!II)
- continue;
+ const unsigned FunctionNTID = OverallReqNTID.value_or(
+ OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));
+ const unsigned FunctionClusterRank =
+ OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());
+
+ const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
+ std::min(1024u, FunctionNTID),
+ std::min(64u, FunctionNTID)};
+
+ // We conservatively use the maximum grid size as an upper bound for the
+ // cluster rank.
+ const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
+ std::min(0xffffu, FunctionClusterRank),
+ std::min(0xffffu, FunctionClusterRank)};
+
+ const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
switch (II->getIntrinsicID()) {
// Index within block
case Intrinsic::nvvm_read_ptx_sreg_tid_x:
- Changed |= addRangeAttr(0, MaxBlockSize.x, II);
- break;
+ return addRangeAttr(0, MaxBlockSize.X, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_y:
- Changed |= addRangeAttr(0, MaxBlockSize.y, II);
- break;
+ return addRangeAttr(0, MaxBlockSize.Y, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_z:
- Changed |= addRangeAttr(0, MaxBlockSize.z, II);
- break;
+ return addRangeAttr(0, MaxBlockSize.Z, II);
// Block size
case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
- Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
- break;
+ return addRangeAttr(1, MaxBlockSize.X + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
- Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
- break;
+ return addRangeAttr(1, MaxBlockSize.Y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
- Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
- break;
-
- // Index within grid
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
- Changed |= addRangeAttr(0, MaxGridSize.x, II);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
- Changed |= addRangeAttr(0, MaxGridSize.y, II);
+ return addRangeAttr(1, MaxBlockSize.Z + 1, II);
+
+ // Index within cluster and cluster size
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
+ return addRangeAttr(0, MaxClusterRank.X, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
+ return addRangeAttr(0, MaxClusterRank.Y, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
+ return addRangeAttr(0, MaxClusterRank.Z, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
+ return addRangeAttr(1, MaxClusterRank.X + 1, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
+ return addRangeAttr(1, MaxClusterRank.Y + 1, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
+ return addRangeAttr(1, MaxClusterRank.Z + 1, II);
+
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
+ if (OverallClusterRank)
+ return addRangeAttr(0, FunctionClusterRank, II);
break;
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
- Changed |= addRangeAttr(0, MaxGridSize.z, II);
+ case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
+ if (OverallClusterRank)
+ return addRangeAttr(1, FunctionClusterRank + 1, II);
break;
-
- // Grid size
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
- Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
- Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
- Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
- break;
-
- // warp size is constant 32.
- case Intrinsic::nvvm_read_ptx_sreg_warpsize:
- Changed |= addRangeAttr(32, 32 + 1, II);
- break;
-
- // Lane ID is [0..warpsize)
- case Intrinsic::nvvm_read_ptx_sreg_laneid:
- Changed |= addRangeAttr(0, 32, II);
- break;
-
default:
- break;
+ return false;
}
- }
+ return false;
+ };
+
+ // Go through the calls in this function.
+ bool Changed = false;
+ for (Instruction &I : instructions(F))
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+ Changed |= ProccessIntrinsic(II);
return Changed;
}
diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 884a4b1a3584f..48fa3e06629b4 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefix=DEFAULT
define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
@@ -74,10 +75,149 @@ define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
ret i32 %1
}
+define ptx_kernel i32 @test_cluster_ctaid() "nvvm.maxclusterrank"="8" {
+; CHECK-LABEL: define ptx_kernel i32 @test_cluster_ctaid(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
+; CHECK-NEXT: ret i32 [[TMP15]]
+;
+ %1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+ %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+ %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+ %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+ %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+ %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+ %7 = add i32 %1, %2
+ %8 = add i32 %7, %3
+ %9 = add i32 %8, %4
+ %10 = add i32 %9, %5
+ %11 = add i32 %10, %6
+ ret i32 %11
+}
+
+define ptx_kernel i32 @test_cluster_dim() "nvvm.cluster_dim"="4,4,1" {
+; CHECK-LABEL: define ptx_kernel i32 @test_cluster_dim(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
+; CHECK-NEXT: ret i32 [[TMP15]]
+;
+ %1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+ %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+ %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+ %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+ %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+ %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+ %7 = add i32 %1, %2
+ %8 = add i32 %7, %3
+ %9 = add i32 %8, %4
+ %10 = add i32 %9, %5
+ %11 = add i32 %10, %6
+ ret i32 %11
+}
+
+
+; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.tid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
+; DEFAULT-DAG: declare noundef range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
+
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
index 72c4cda16db47..f595df837f91f 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,13 +1,10 @@
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
-; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \
-; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
define ptx_device i32 @test_tid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
-; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %x
@@ -15,7 +12,6 @@ define ptx_device i32 @test_tid_x() {
define ptx_device i32 @test_tid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
-; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %x
@@ -23,7 +19,6 @@ define ptx_device i32 @test_tid_y() {
define ptx_device i32 @test_tid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
-; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
ret i32 %x
@@ -38,7 +33,6 @@ define ptx_device i32 @test_tid_w() {
define ptx_device i32 @test_ntid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
-; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %x
@@ -46,7 +40,6 @@ define ptx_device i32 @test_ntid_x() {
define ptx_device i32 @test_ntid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
-; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %x
@@ -54,7 +47,6 @@ define ptx_device i32 @test_ntid_y() {
define ptx_device i32 @test_ntid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
-; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
ret i32 %x
@@ -69,7 +61,6 @@ define ptx_device i32 @test_ntid_w() {
define ptx_device i32 @test_laneid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
-; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
ret i32 %x
@@ -77,7 +68,6 @@ define ptx_device i32 @test_laneid() {
define ptx_device i32 @test_warpsize() {
; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ;
-; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
ret i32 %x
@@ -99,7 +89,6 @@ define ptx_device i32 @test_nwarpid() {
define ptx_device i32 @test_ctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
-; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %x
@@ -107,7 +96,6 @@ define ptx_device i32 @test_ctaid_y() {
define ptx_device i32 @test_ctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
-; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
ret i32 %x
@@ -115,7 +103,6 @@ define ptx_device i32 @test_ctaid_z() {
define ptx_device i32 @test_ctaid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
-; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %x
@@ -130,7 +117,6 @@ define ptx_device i32 @test_ctaid_w() {
define ptx_device i32 @test_nctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
-; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
ret i32 %x
@@ -138,7 +124,6 @@ define ptx_device i32 @test_nctaid_y() {
define ptx_device i32 @test_nctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
-; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
ret i32 %x
@@ -146,7 +131,6 @@ define ptx_device i32 @test_nctaid_z() {
define ptx_device i32 @test_nctaid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
-; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
ret i32 %x
@@ -154,7 +138,7 @@ define ptx_device i32 @test_nctaid_x() {
define ptx_device i32 @test_already_has_range_md() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
-; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[ALREADY:[0-9]+]]
+; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range !0
ret i32 %x
}
@@ -316,4 +300,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.pm3()
declare void @llvm.nvvm.bar.sync(i32 %i)
!0 = !{i32 0, i32 19}
-; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19}
>From 7a9a0eddfd0c03975800b801cde70954ad2b4523 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Sun, 4 May 2025 00:03:14 +0000
Subject: [PATCH 2/2] address comments
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 6b385a30c804d..8b87822d3fdda 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -141,6 +141,9 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
defvar WARP_SIZE = 32;
+// Note: the maximum grid size in the x-dimension is the lower value of 65535
+// on sm_20. We conservatively use the larger value here as it is required for
+// sm_30+ and is also correct for sm_20.
defvar MAX_GRID_SIZE_X = 0x7fffffff;
defvar MAX_GRID_SIZE_Y = 0xffff;
defvar MAX_GRID_SIZE_Z = 0xffff;
@@ -4768,6 +4771,7 @@ class PTXReadSRegIntrinsic_r32<string name,
multiclass PTXReadSRegIntrinsic_v4i32<string regname,
list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
// FIXME: Do we need the 128-bit integer type version?
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
@@ -4781,6 +4785,7 @@ multiclass PTXReadSRegIntrinsic_v4i32<string regname,
// Same, but without automatic clang builtins. It will be used for
// registers that require particular GPU or PTX version.
multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
defvar suffixes = ["_x", "_y", "_z", "_w"];
foreach i = !range(suffixes) in
def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
More information about the llvm-commits
mailing list