[llvm] 37da5a1 - [NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (#138338)

via llvm-commits llvm-commits at lists.llvm.org
Mon May 5 16:22:28 PDT 2025


Author: Alex MacLean
Date: 2025-05-05T16:22:25-07:00
New Revision: 37da5a10f0cf8375c8d1237e9015efae8a8161c1

URL: https://github.com/llvm/llvm-project/commit/37da5a10f0cf8375c8d1237e9015efae8a8161c1
DIFF: https://github.com/llvm/llvm-project/commit/37da5a10f0cf8375c8d1237e9015efae8a8161c1.diff

LOG: [NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (#138338)

Pull the global intrinsic ranges out of NVVMIntrRange and into the
intrinsic table-gen definitions. Also improve range inference for
cluster SReg intrinsics.

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsNVVM.td
    llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
    llvm/lib/Target/NVPTX/NVPTXUtilities.h
    llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
    llvm/test/CodeGen/NVPTX/intr-range.ll
    llvm/test/CodeGen/NVPTX/intrinsic-old.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2c550de7a9203..8b87822d3fdda 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>;  // (shared_cluster)ptr
 // MISC
 //
 
+defvar WARP_SIZE = 32;
+
+// Note: the maximum grid size in the x-dimension is the lower value of 65535
+// on sm_20. We conservatively use the larger value here as it is required for
+// sm_30+ and is also correct for sm_20.
+defvar MAX_GRID_SIZE_X = 0x7fffffff;
+defvar MAX_GRID_SIZE_Y = 0xffff;
+defvar MAX_GRID_SIZE_Z = 0xffff;
+
+defvar MAX_BLOCK_SIZE_X = 1024;
+defvar MAX_BLOCK_SIZE_Y = 1024;
+defvar MAX_BLOCK_SIZE_Z = 64;
+
 // Helper class that concatenates list elements with
 // a given separator 'sep' and returns the result.
 // Handles empty strings.
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
 
 // Accessing special registers.
 
-class PTXReadSRegIntrinsicNB_r32
-  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
-class PTXReadSRegIntrinsic_r32<string name>
-  : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
+  : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
+      !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
 
-multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
+class PTXReadSRegIntrinsic_r32<string name,
+                               list<IntrinsicProperty> properties = []>
+  : PTXReadSRegIntrinsicNB_r32<properties>,
+    ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
+
+multiclass PTXReadSRegIntrinsic_v4i32<string regname,
+                                      list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
+  assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
 // FIXME: Do we need the 128-bit integer type version?
 //    def _r64   : Intrinsic<[llvm_i128_ty],   [], [IntrNoMem, IntrSpeculatable]>;
 
 // FIXME: Enable this once v4i32 support is enabled in back-end.
 //    def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
-  foreach suffix = ["_x", "_y", "_z", "_w"] in
-    def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
+  defvar suffixes = ["_x", "_y", "_z", "_w"];
+  foreach i = !range(suffixes) in
+    def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
 }
 
 // Same, but without automatic clang builtins. It will be used for
 // registers that require particular GPU or PTX version.
-multiclass PTXReadSRegIntrinsicNB_v4i32 {
-  foreach suffix = ["_x", "_y", "_z", "_w"] in
-    def suffix : PTXReadSRegIntrinsicNB_r32;
+multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
+  assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
+  defvar suffixes = ["_x", "_y", "_z", "_w"];
+  foreach i = !range(suffixes) in
+    def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
 }
 
 class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
   : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
     ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
 
-defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
-defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
+defm int_nvvm_read_ptx_sreg_tid
+  : PTXReadSRegIntrinsic_v4i32<"tid",
+                              [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
+                               [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
+                               [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
+                               [Range<RetIndex, 0, 1>]]>;
+
+defm int_nvvm_read_ptx_sreg_ntid
+  : PTXReadSRegIntrinsic_v4i32<"ntid",
+                               [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
+                                [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
+                                [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
+                                [Range<RetIndex, 0, 1>]]>;
+
+def int_nvvm_read_ptx_sreg_laneid
+  : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
 
-def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
 def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
 def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
 
-defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
-defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
+defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
+                            [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
+                            [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
+                            [Range<RetIndex, 0, 1>]];
+
+defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
+                             [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
+                             [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
+                             [Range<RetIndex, 0, 1>]];
+
+defm int_nvvm_read_ptx_sreg_ctaid
+  : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
+
+defm int_nvvm_read_ptx_sreg_nctaid
+  : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
 
 def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
 def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
 def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
 def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
 
-def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
+def int_nvvm_read_ptx_sreg_warpsize
+  : PTXReadSRegIntrinsic_r32<"warpsize",
+                             [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
 
 // sm90+, PTX7.8+
-defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
-defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
+
+// Note: Since clusters are subdivisions of the grid, we conservatively use the
+// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
+// practice, the clusterid will likely be much smaller. The CUDA programming
+// guide recommends 8 as a maximum portable value and H100s support 16.
+
+defm int_nvvm_read_ptx_sreg_clusterid
+  : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_nclusterid
+  : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_ctaid
+  : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_nctaid
+  : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
 
 def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
 def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 61b50b69b4e86..6586f925504f1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
   return getVectorProduct(ReqNTID);
 }
 
+std::optional<uint64_t> getOverallClusterRank(const Function &F) {
+  // maxclusterrank and cluster_dim are mutually exclusive.
+  if (const auto ClusterRank = getMaxClusterRank(F))
+    return ClusterRank;
+
+  // Note: The semantics here are a bit strange. See getMaxNTID.
+  const auto ClusterDim = getClusterDim(F);
+  return getVectorProduct(ClusterDim);
+}
+
 std::optional<unsigned> getMaxClusterRank(const Function &F) {
   return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
 }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index a1b4a0e5e7471..e792e441e49e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);
 
 std::optional<uint64_t> getOverallMaxNTID(const Function &);
 std::optional<uint64_t> getOverallReqNTID(const Function &);
+std::optional<uint64_t> getOverallClusterRank(const Function &);
 
 std::optional<unsigned> getMaxClusterRank(const Function &);
 std::optional<unsigned> getMinCTASm(const Function &);

diff  --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 91b8e470e055e..2c81989932a97 100644
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
 }
 
 static bool runNVVMIntrRange(Function &F) {
-  struct {
-    unsigned x, y, z;
-  } MaxBlockSize, MaxGridSize;
+  struct Vector3 {
+    unsigned X, Y, Z;
+  };
 
-  const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
-      getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+  // All these annotations are only valid for kernel functions.
+  if (!isKernelFunction(F))
+    return false;
 
-  MaxBlockSize.x = std::min(1024u, MetadataNTID);
-  MaxBlockSize.y = std::min(1024u, MetadataNTID);
-  MaxBlockSize.z = std::min(64u, MetadataNTID);
+  const auto OverallReqNTID = getOverallReqNTID(F);
+  const auto OverallMaxNTID = getOverallMaxNTID(F);
+  const auto OverallClusterRank = getOverallClusterRank(F);
 
-  MaxGridSize.x = 0x7fffffff;
-  MaxGridSize.y = 0xffff;
-  MaxGridSize.z = 0xffff;
+  // If this function lacks any range information, do nothing.
+  if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
+    return false;
 
-  // Go through the calls in this function.
-  bool Changed = false;
-  for (Instruction &I : instructions(F)) {
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-    if (!II)
-      continue;
+  const unsigned FunctionNTID = OverallReqNTID.value_or(
+      OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));
 
+  const unsigned FunctionClusterRank =
+      OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());
+
+  const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
+                             std::min(1024u, FunctionNTID),
+                             std::min(64u, FunctionNTID)};
+
+  // We conservatively use the maximum grid size as an upper bound for the
+  // cluster rank.
+  const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
+                               std::min(0xffffu, FunctionClusterRank),
+                               std::min(0xffffu, FunctionClusterRank)};
+
+  const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
     switch (II->getIntrinsicID()) {
     // Index within block
     case Intrinsic::nvvm_read_ptx_sreg_tid_x:
-      Changed |= addRangeAttr(0, MaxBlockSize.x, II);
-      break;
+      return addRangeAttr(0, MaxBlockSize.X, II);
     case Intrinsic::nvvm_read_ptx_sreg_tid_y:
-      Changed |= addRangeAttr(0, MaxBlockSize.y, II);
-      break;
+      return addRangeAttr(0, MaxBlockSize.Y, II);
     case Intrinsic::nvvm_read_ptx_sreg_tid_z:
-      Changed |= addRangeAttr(0, MaxBlockSize.z, II);
-      break;
+      return addRangeAttr(0, MaxBlockSize.Z, II);
 
     // Block size
     case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
-      Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
-      break;
+      return addRangeAttr(1, MaxBlockSize.X + 1, II);
     case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
-      Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
-      break;
+      return addRangeAttr(1, MaxBlockSize.Y + 1, II);
     case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
-      Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
-      break;
-
-    // Index within grid
-    case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
-      Changed |= addRangeAttr(0, MaxGridSize.x, II);
-      break;
-    case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
-      Changed |= addRangeAttr(0, MaxGridSize.y, II);
+      return addRangeAttr(1, MaxBlockSize.Z + 1, II);
+
+    // Cluster size
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
+      return addRangeAttr(0, MaxClusterRank.X, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
+      return addRangeAttr(0, MaxClusterRank.Y, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
+      return addRangeAttr(0, MaxClusterRank.Z, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
+      return addRangeAttr(1, MaxClusterRank.X + 1, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
+      return addRangeAttr(1, MaxClusterRank.Y + 1, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
+      return addRangeAttr(1, MaxClusterRank.Z + 1, II);
+
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
+      if (OverallClusterRank)
+        return addRangeAttr(0, FunctionClusterRank, II);
       break;
-    case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
-      Changed |= addRangeAttr(0, MaxGridSize.z, II);
+    case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
+      if (OverallClusterRank)
+        return addRangeAttr(1, FunctionClusterRank + 1, II);
       break;
-
-    // Grid size
-    case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
-      Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
-      break;
-    case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
-      Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
-      break;
-    case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
-      Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
-      break;
-
-    // warp size is constant 32.
-    case Intrinsic::nvvm_read_ptx_sreg_warpsize:
-      Changed |= addRangeAttr(32, 32 + 1, II);
-      break;
-
-    // Lane ID is [0..warpsize)
-    case Intrinsic::nvvm_read_ptx_sreg_laneid:
-      Changed |= addRangeAttr(0, 32, II);
-      break;
-
     default:
-      break;
+      return false;
     }
-  }
+    return false;
+  };
+
+  // Go through the calls in this function.
+  bool Changed = false;
+  for (Instruction &I : instructions(F))
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+      Changed |= ProccessIntrinsic(II);
 
   return Changed;
 }

diff  --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 884a4b1a3584f..48fa3e06629b4 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefix=DEFAULT
 
 define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
 ; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
@@ -74,10 +75,149 @@ define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
   ret i32 %1
 }
 
+define ptx_kernel i32 @test_cluster_ctaid() "nvvm.maxclusterrank"="8" {
+; CHECK-LABEL: define ptx_kernel i32 @test_cluster_ctaid(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; CHECK-NEXT:    [[TMP3:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP15]]
+;
+  %1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+  %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+  %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+  %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+  %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+  %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+  %7 = add i32 %1, %2
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %4
+  %10 = add i32 %9, %5
+  %11 = add i32 %10, %6
+  ret i32 %11
+}
+
+define ptx_kernel i32 @test_cluster_dim() "nvvm.cluster_dim"="4,4,1" {
+; CHECK-LABEL: define ptx_kernel i32 @test_cluster_dim(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; CHECK-NEXT:    [[TMP3:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP15]]
+;
+  %1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+  %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+  %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+  %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+  %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+  %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+  %7 = add i32 %1, %2
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %4
+  %10 = add i32 %9, %5
+  %11 = add i32 %10, %6
+  ret i32 %11
+}
+
+
+; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 64)   i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)    i32 @llvm.nvvm.read.ptx.sreg.tid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65)   i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)    i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 32)  i32 @llvm.nvvm.read.ptx.sreg.laneid()
+; DEFAULT-DAG: declare noundef range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)          i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)           i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)          i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)           i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 0, 65535)      i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)          i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
+
+; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+; DEFAULT-DAG: declare noundef range(i32 1, 65536)       i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+; DEFAULT-DAG: declare noundef range(i32 0, 1)           i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
+
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.w()
 
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
index 72c4cda16db47..f595df837f91f 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,13 +1,10 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
-; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \
-; RUN:   | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s
 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define ptx_device i32 @test_tid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
-; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 	ret i32 %x
@@ -15,7 +12,6 @@ define ptx_device i32 @test_tid_x() {
 
 define ptx_device i32 @test_tid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
-; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 	ret i32 %x
@@ -23,7 +19,6 @@ define ptx_device i32 @test_tid_y() {
 
 define ptx_device i32 @test_tid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
-; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 	ret i32 %x
@@ -38,7 +33,6 @@ define ptx_device i32 @test_tid_w() {
 
 define ptx_device i32 @test_ntid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
-; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 	ret i32 %x
@@ -46,7 +40,6 @@ define ptx_device i32 @test_ntid_x() {
 
 define ptx_device i32 @test_ntid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
-; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 	ret i32 %x
@@ -54,7 +47,6 @@ define ptx_device i32 @test_ntid_y() {
 
 define ptx_device i32 @test_ntid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
-; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 	ret i32 %x
@@ -69,7 +61,6 @@ define ptx_device i32 @test_ntid_w() {
 
 define ptx_device i32 @test_laneid() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
-; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
 	ret i32 %x
@@ -77,7 +68,6 @@ define ptx_device i32 @test_laneid() {
 
 define ptx_device i32 @test_warpsize() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ;
-; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 	ret i32 %x
@@ -99,7 +89,6 @@ define ptx_device i32 @test_nwarpid() {
 
 define ptx_device i32 @test_ctaid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
-; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
 	ret i32 %x
@@ -107,7 +96,6 @@ define ptx_device i32 @test_ctaid_y() {
 
 define ptx_device i32 @test_ctaid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
-; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
 	ret i32 %x
@@ -115,7 +103,6 @@ define ptx_device i32 @test_ctaid_z() {
 
 define ptx_device i32 @test_ctaid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
-; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 	ret i32 %x
@@ -130,7 +117,6 @@ define ptx_device i32 @test_ctaid_w() {
 
 define ptx_device i32 @test_nctaid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
-; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
 	ret i32 %x
@@ -138,7 +124,6 @@ define ptx_device i32 @test_nctaid_y() {
 
 define ptx_device i32 @test_nctaid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
-; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 	ret i32 %x
@@ -146,7 +131,6 @@ define ptx_device i32 @test_nctaid_z() {
 
 define ptx_device i32 @test_nctaid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
-; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
 ; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
 	ret i32 %x
@@ -154,7 +138,7 @@ define ptx_device i32 @test_nctaid_x() {
 
 define ptx_device i32 @test_already_has_range_md() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
-; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[ALREADY:[0-9]+]]
+; CHECK: ret;
 	%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range !0
 	ret i32 %x
 }
@@ -316,4 +300,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.pm3()
 declare void @llvm.nvvm.bar.sync(i32 %i)
 
 !0 = !{i32 0, i32 19}
-; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19}


        


More information about the llvm-commits mailing list