[llvm-branch-commits] [mlir] [mlir][GPU] Plumb range information through the NVVM lowerings (PR #107659)

Krzysztof Drewniak via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Sep 9 16:16:43 PDT 2024


https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/107659

From c7d3804afe9a7e6325f6af230f060c19aceca09b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 6 Sep 2024 23:45:52 +0000
Subject: [PATCH 1/3] [mlir][GPU] Plumb range information through the NVVM
 lowerings

Update the GPU to NVVM lowerings to correctly propagate range
information on IDs and dimension queries, either from
known_{block,grid}_size attributes or from `upperBound` annotations on
the operations themselves.
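
For example, with a kernel carrying known_block_size = array<i32: 32, 4, 2>,
the x-dimension thread-id query now lowers to a range-annotated
special-register read. A minimal sketch mirroring the updated
gpu-to-nvvm.mlir test below (surrounding conversion boilerplate elided):

  // %tid = gpu.thread_id x, inside the annotated kernel, becomes:
  %0 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 32 : i32> : i32

On export to LLVM IR, the attribute is rendered as a range(i32 0, 32)
return attribute on the corresponding intrinsic call, as exercised by
the nvvmir.mlir test below.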
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   | 283 +++++++++---------
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  50 +++-
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp  |   1 +
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     |  18 +-
 mlir/test/Target/LLVMIR/Import/nvvmir.ll      |   3 +
 mlir/test/Target/LLVMIR/nvvmir.mlir           |   7 +-
 6 files changed, 207 insertions(+), 155 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 709dd922b8fa2f..66ac9f289d233b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -86,8 +86,8 @@ class NVVM_Op<string mnemonic, list<Trait> traits = []> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
 
-/// Base class that defines BasicPtxBuilderOpInterface. 
-class NVVM_PTXBuilder_Op<string mnemonic, 
+/// Base class that defines BasicPtxBuilderOpInterface.
+class NVVM_PTXBuilder_Op<string mnemonic,
   list<Trait> traits = [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
@@ -123,52 +123,67 @@ class NVVM_SpecialRegisterOp<string mnemonic, list<Trait> traits = []> :
   let assemblyFormat = "attr-dict `:` type($res)";
 }
 
+class NVVM_SpecialRangeableRegisterOp<string mnemonic, list<Trait> traits = []> :
+  NVVM_SpecialRegisterOp<mnemonic, traits> {
+  let arguments = (ins OptionalAttr<LLVM_ConstantRangeAttr>:$range);
+  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
+  let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda;
+  let mlirBuilder = baseMlirBuilder # importRangeRetAttrCode # baseMlirBuilderCoda;
+
+  // Backwards-compatibility builder for an unspecified range.
+  let builders = [
+    OpBuilder<(ins "Type":$resultType), [{
+      build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
+    }]>
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // Lane index and range
-def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">;
-def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">;
+def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">;
+def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">;
 
 //===----------------------------------------------------------------------===//
 // Thread index and range
-def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">;
-def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">;
-def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">;
-def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">;
-def NVVM_BlockDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">;
-def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">;
+def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">;
+def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">;
+def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">;
+def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">;
+def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">;
+def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">;
 
 //===----------------------------------------------------------------------===//
 // Block index and range
-def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">;
-def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">;
-def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">;
-def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">;
-def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">;
-def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">;
+def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">;
+def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">;
+def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">;
+def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">;
+def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">;
+def NVVM_GridDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">;
 
 //===----------------------------------------------------------------------===//
 // CTA Cluster index and range
-def NVVM_ClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.x">;
-def NVVM_ClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.y">;
-def NVVM_ClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.z">;
-def NVVM_ClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.x">;
-def NVVM_ClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.y">;
-def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">;
+def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x">;
+def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">;
+def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">;
+def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">;
+def NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">;
+def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">;
 
 
 //===----------------------------------------------------------------------===//
 // CTA index and range within Cluster
-def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
-def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
-def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
-def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
-def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
-def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
+def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
+def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
+def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
+def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
+def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
+def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
 
 //===----------------------------------------------------------------------===//
 // CTA index and across Cluster dimensions
-def NVVM_ClusterId : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctarank">;
-def NVVM_ClusterDim : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctarank">;
+def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">;
+def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
 
 //===----------------------------------------------------------------------===//
 // Clock registers
@@ -197,11 +212,11 @@ def ReduxKindMin  : I32EnumAttrCase<"MIN", 4, "min">;
 def ReduxKindOr   : I32EnumAttrCase<"OR", 5, "or">;
 def ReduxKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">;
 def ReduxKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">;
-def ReduxKindXor  : I32EnumAttrCase<"XOR", 8, "xor">; 
+def ReduxKindXor  : I32EnumAttrCase<"XOR", 8, "xor">;
 
 /// Enum attribute of the different kinds.
 def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind",
-  [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr, 
+  [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr,
     ReduxKindUmax, ReduxKindUmin, ReduxKindXor]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
@@ -221,7 +236,7 @@ def NVVM_ReduxOp :
   }];
   let assemblyFormat = [{
     $kind $val `,` $mask_and_clamp  attr-dict `:` type($val) `->` type($res)
-   }];   
+   }];
 }
 
 //===----------------------------------------------------------------------===//
@@ -308,7 +323,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.
   let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
 }
 
-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,  
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> {
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
@@ -316,16 +331,16 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
   }];
 }
 
-def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,  
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {    
+def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); }
   }];
 }
 
-def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,  
-  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {  
+def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
+  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -338,13 +353,13 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
         "bra.uni     LAB_WAIT; \n\t"
         "DONE: \n\t"
         "}"
-      ); 
+      );
     }
   }];
 }
 
-def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,  
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {  
+def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -357,7 +372,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
         "bra.uni     LAB_WAIT; \n\t"
         "DONE: \n\t"
         "}"
-      ); 
+      );
     }
   }];
 }
@@ -392,7 +407,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
 }
 
 def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
-  let arguments = (ins     
+  let arguments = (ins
     Optional<I32>:$barrierId,
     Optional<I32>:$numberOfThreads);
   string llvmBuilder = [{
@@ -401,7 +416,7 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
                 {$barrierId, $numberOfThreads});
     } else if($barrierId) {
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n,
-                {$barrierId});   
+                {$barrierId});
     } else {
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0);
     }
@@ -410,27 +425,27 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
   let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
 }
 
-def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> 
+def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
 {
   let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads);
 
   let description = [{
-    Thread that executes this op announces their arrival at the barrier with 
+    Thread that executes this op announces their arrival at the barrier with
     given id and continue their execution.
 
-    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When 
-    `barrierId` is not present, the default barrier id is used. 
+    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
+    `barrierId` is not present, the default barrier id is used.
 
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];
-  
+
   let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
 
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
       std::string ptx = "bar.arrive ";
-      if (getBarrierId()) { ptx += "%0, %1;"; } 
+      if (getBarrierId()) { ptx += "%0, %1;"; }
       else { ptx += "0, %0;"; }
       return ptx;
     }
@@ -553,7 +568,7 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];
-  
+
   let assemblyFormat = "attr-dict";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -671,9 +686,9 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];
-  
+
   let assemblyFormat = "attr-dict";
-  let extraClassDefinition = [{        
+  let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
       return std::string("fence.mbarrier_init.release.cluster;");
     }
@@ -749,13 +764,13 @@ def NVVM_SyncWarpOp :
 }
 
 
-def NVVM_ElectSyncOp : NVVM_Op<"elect.sync", 
+def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
                   [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>
-{  
+{
   let results = (outs I1:$pred);
-  let assemblyFormat = "attr-dict `->` type(results)";  
-  let extraClassDefinition = [{        
-    std::string $cppClass::getPtx() { 
+  let assemblyFormat = "attr-dict `->` type(results)";
+  let extraClassDefinition = [{
+    std::string $cppClass::getPtx() {
       return std::string(
         "{                                  \n"
         ".reg .u32 rx;                      \n"
@@ -764,7 +779,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
         "    elect.sync rx | px, 0xFFFFFFFF;\n"
         "@px mov.pred %0, 1;                \n"
         "}\n"
-      ); 
+      );
     }
   }];
 }
@@ -776,16 +791,16 @@ def LoadCacheModifierLU : I32EnumAttrCase<"LU", 3, "lu">;
 def LoadCacheModifierCV : I32EnumAttrCase<"CV", 4, "cv">;
 
 /// Enum attribute of the different kinds.
-def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", 
+def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
                                 "NVVM load cache modifier kind",
-  [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS, 
+  [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS,
     LoadCacheModifierLU, LoadCacheModifierCV]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
   let description = [{
     Enum attribute of the different kinds of cache operators for load instructions.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)    
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)
   }];
 }
 
@@ -811,7 +826,7 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
             id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
           else if($modifier == NVVM::LoadCacheModifierKind::CA)
             id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
-          else 
+          else
             llvm_unreachable("unsupported cache modifier");
           break;
         default:
@@ -824,21 +839,21 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
   let extraClassDeclaration = [{
     bool hasIntrinsic() { if(getCpSize()) return false; return true; }
 
-    void getAsmValues(RewriterBase &rewriter, 
+    void getAsmValues(RewriterBase &rewriter,
         llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) {
       asmValues.push_back({getDst(), PTXRegisterMod::Read});
       asmValues.push_back({getSrc(), PTXRegisterMod::Read});
       asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read});
       asmValues.push_back({getCpSize(), PTXRegisterMod::Read});
-    }        
+    }
   }];
-  let extraClassDefinition = [{        
-    std::string $cppClass::getPtx() { 
+  let extraClassDefinition = [{
+    std::string $cppClass::getPtx() {
       if(getModifier() == NVVM::LoadCacheModifierKind::CG)
         return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n");
       if(getModifier() == NVVM::LoadCacheModifierKind::CA)
         return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n");
-      llvm_unreachable("unsupported cache modifier");      
+      llvm_unreachable("unsupported cache modifier");
     }
   }];
 }
@@ -1526,9 +1541,9 @@ def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">,
   let hasVerifier = 1;
 }
 
-def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, 
-  Arguments<(ins LLVM_PointerShared:$ptr, 
-                 Variadic<I32>:$sources, 
+def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
+  Arguments<(ins LLVM_PointerShared:$ptr,
+                 Variadic<I32>:$sources,
                  MMALayoutAttr:$layout)> {
   let summary = "cooperative matrix store";
   let description = [{
@@ -1537,7 +1552,7 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
   }];
-  
+
   let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -1757,25 +1772,25 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
 }
 
 def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
-  Arguments<(ins 
-    ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group, 
+  Arguments<(ins
+    ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group,
     OptionalAttr<UnitAttr>:$read)> {
   let assemblyFormat = "$group attr-dict";
   let description = [{
     Op waits for completion of the most recent bulk async-groups.
 
     The `$group` operand tells waiting has to be done until for $group or fewer
-    of the most recent bulk async-groups. If `$group` is 0, the op wait until 
+    of the most recent bulk async-groups. If `$group` is 0, the op wait until
     all the most recent bulk async-groups have completed.
 
-    The `$read` indicates that the waiting has to be done until all the bulk 
-    async operations in the specified bulk async-group have completed reading 
+    The `$read` indicates that the waiting has to be done until all the bulk
+    async operations in the specified bulk async-group have completed reading
     from their source locations.
 
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
   }];
-  
+
   string llvmBuilder = [{
     auto intId = op.getRead() ?
       llvm::Intrinsic::nvvm_cp_async_bulk_wait_group_read :
@@ -1784,53 +1799,53 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
   }];
 }
 
-def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : 
-  NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", 
-  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, 
+def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
+  NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global",
+  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
   AttrSizedOperandSegments]>,
   Arguments<(ins  LLVM_PointerShared:$dstMem,
                   LLVM_AnyPointer:$tmaDescriptor,
                   Variadic<I32>:$coordinates,
-                  LLVM_PointerShared:$mbar,                  
+                  LLVM_PointerShared:$mbar,
                   Variadic<I16>:$im2colOffsets,
                   Optional<I16>:$multicastMask,
                   Optional<I64>:$l2CacheHint,
                   PtxPredicate:$predicate)> {
   let description = [{
-    Initiates an asynchronous copy operation on the tensor data from global 
-    memory to shared memory. 
+    Initiates an asynchronous copy operation on the tensor data from global
+    memory to shared memory.
 
     The Op operates has two load modes:
-    1) Tiled Mode: It's the default mode. The source multi-dimensional tensor 
-    layout is preserved at the destination. 
+    1) Tiled Mode: It's the default mode. The source multi-dimensional tensor
+    layout is preserved at the destination.
 
     2) Im2col Mode: This mode is used when `im2colOffsets` operands are present.
     the elements in the Bounding Box of the source tensor are rearranged into
-    columns at the destination. In this mode, the tensor has to be at least 
-    3-dimensional. 
+    columns at the destination. In this mode, the tensor has to be at least
+    3-dimensional.
 
     The `multicastMask` operand is optional. When it is present, the Op copies
     data from global memory to shared memory of multiple CTAs in the cluster.
-    Operand `multicastMask` specifies the destination CTAs in the cluster such 
+    Operand `multicastMask` specifies the destination CTAs in the cluster such
     that each bit position in the 16-bit `multicastMask` operand corresponds to
-    the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.     
+    the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
 
-    The `l2CacheHint` operand is optional, and it is used to specify cache 
+    The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.
-    
+
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
   }];
 
-  let assemblyFormat = [{ 
-    $dstMem `,` 
-    $tmaDescriptor `,` 
-    $mbar `,` 
-    `box` `[`$coordinates `]` 
+  let assemblyFormat = [{
+    $dstMem `,`
+    $tmaDescriptor `,`
+    $mbar `,`
+    `box` `[`$coordinates `]`
     (`im2col` `[` $im2colOffsets^ `]` )?
     (`multicast_mask` `=` $multicastMask^ )?
     (`l2_cache_hint` `=` $l2CacheHint^ )?
-    (`predicate` `=` $predicate^)? 
+    (`predicate` `=` $predicate^)?
     attr-dict  `:` type($dstMem) `,` type($tmaDescriptor)
   }];
 
@@ -1840,16 +1855,16 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
       int dim = getCoordinates().size();
       std::string ptx = "cp.async.bulk.tensor.";
       ptx += std::to_string(dim) + "d.";
-      ptx += "shared::cluster.global.mbarrier::complete_tx::bytes";      
+      ptx += "shared::cluster.global.mbarrier::complete_tx::bytes";
       if(im2colDim) ptx += ".im2col";
-      if(getMulticastMask()) ptx += ".multicast::cluster";      
+      if(getMulticastMask()) ptx += ".multicast::cluster";
       if(getL2CacheHint()) ptx += ".L2::cache_hint";
-      
+
       auto preg = [](int r) { return "%" + std::to_string(r); };
 
       // Build Registers
       ptx += " [%0], [%1, {";
-      int r = 2;      
+      int r = 2;
       for(int i = 0; i < dim; i++) ptx += preg(r+i) + ",";
       ptx.pop_back(); r += dim;
       ptx += "} ], [%" + std::to_string(r++) + "]";
@@ -1868,19 +1883,19 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
   let hasVerifier = 1;
 }
 
-def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : 
-  NVVM_Op<"cp.async.bulk.tensor.global.shared.cta", 
-  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, 
+def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp :
+  NVVM_Op<"cp.async.bulk.tensor.global.shared.cta",
+  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
   AttrSizedOperandSegments]>,
   Arguments<(ins  LLVM_AnyPointer:$tmaDescriptor,
                   LLVM_PointerShared:$srcMem,
                   Variadic<I32>:$coordinates,
                   PtxPredicate:$predicate)> {
-  let assemblyFormat = [{ 
-    $tmaDescriptor `,` 
-    $srcMem `,` 
-    `box` `[`$coordinates `]` 
-    (`,` `predicate` `=` $predicate^)?  
+  let assemblyFormat = [{
+    $tmaDescriptor `,`
+    $srcMem `,`
+    `box` `[`$coordinates `]`
+    (`,` `predicate` `=` $predicate^)?
     attr-dict  `:` type(operands)
   }];
   let extraClassDefinition = [{
@@ -1905,7 +1920,7 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
   Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
   let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
-    std::string $cppClass::getPtx() { 
+    std::string $cppClass::getPtx() {
       return std::string("prefetch.tensormap [%0];");
     }
   }];
@@ -1918,9 +1933,9 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
 def NVVM_WgmmaFenceAlignedOp : NVVM_PTXBuilder_Op<"wgmma.fence.aligned"> {
   let arguments = (ins);
   let description = [{
-    Enforce an ordering of register accesses between warpgroup level matrix 
-    multiplication and other operations. 
-    
+    Enforce an ordering of register accesses between warpgroup level matrix
+    multiplication and other operations.
+
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence)
   }];
   let assemblyFormat = "attr-dict";
@@ -1934,7 +1949,7 @@ def NVVM_WgmmaGroupSyncAlignedOp : NVVM_PTXBuilder_Op<"wgmma.commit.group.sync.a
   let assemblyFormat = "attr-dict";
   let description = [{
     Commits all prior uncommitted warpgroup level matrix multiplication operations.
-    
+
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group)
   }];
   let extraClassDefinition = [{
@@ -1947,7 +1962,7 @@ def NVVM_WgmmaWaitGroupSyncOp : NVVM_PTXBuilder_Op<"wgmma.wait.group.sync.aligne
   let assemblyFormat = "attr-dict $group";
   let description = [{
     Signal the completion of a preceding warpgroup operation.
-    
+
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group)
   }];
   let extraClassDefinition = [{
@@ -1994,7 +2009,7 @@ def WGMMATypeS32 : I32EnumAttrCase<"s32", 9>;
 def WGMMATypes : I32EnumAttr<"WGMMATypes", "NVVM WGMMA types",
   [WGMMATypeF16, WGMMATypeTF32,
     WGMMATypeU8, WGMMATypeS8,
-    WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3, 
+    WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3,
     WGMMATypeF8E5M2, WGMMATypeF32, WGMMATypeS32]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
@@ -2004,44 +2019,44 @@ def WGMMATypesAttr : EnumAttr<NVVM_Dialect, WGMMATypes, "wgmma_type"> {
 }
 
 
-def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", 
+def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
               [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
                 PredOpTrait<"input struct and result struct must be the same type",
-                  TCresIsSameAsOpBase<0, 0>>,]> 
+                  TCresIsSameAsOpBase<0, 0>>,]>
 {
   let results = (outs LLVM_AnyStruct:$results);
-  let arguments = (ins 
+  let arguments = (ins
     LLVM_AnyStruct:$inouts,
-    I64:$descriptorA, 
-    I64:$descriptorB, 
+    I64:$descriptorA,
+    I64:$descriptorB,
     NVVM_MMAShapeAttr:$shape,
     WGMMATypesAttr:$typeA,
     WGMMATypesAttr:$typeB,
     WGMMATypesAttr:$typeD,
     WGMMAScaleOutAttr:$scaleD,
     WGMMAScaleInAttr:$scaleA,
-    WGMMAScaleInAttr:$scaleB, 
+    WGMMAScaleInAttr:$scaleB,
     MMALayoutAttr:$layoutA,
     MMALayoutAttr:$layoutB,
     OptionalAttr<MMAIntOverflowAttr>:$satfinite
-  );  
-  
-   let assemblyFormat = [{ 
+  );
+
+   let assemblyFormat = [{
       $descriptorA `,` $descriptorB `,` $inouts `,` $shape `,`
       `D` `[` $typeD `,` $scaleD (`,` $satfinite^)? `]` `,`
-      `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,` 
+      `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,`
       `B` `[` $typeB `,` $scaleB `,` $layoutB `]`
-      attr-dict `:` 
+      attr-dict `:`
       type($inouts) `->` type($results)
     }];
-  
+
   let description = [{
-    The warpgroup (128 threads) level matrix multiply and accumulate operation 
+    The warpgroup (128 threads) level matrix multiply and accumulate operation
     has either of the following forms, where matrix D is called accumulator:
       D = A * B + D
       D = A * B, where the input from accumulator D is disabled.
 
-    Supported shapes:  
+    Supported shapes:
     ```
     |--------------|--------------|------------|--------------|---------------|
     |              |              |            |              |f16+=e4m3*e4m3 |
@@ -2089,14 +2104,14 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
     |--------------|--------------|------------|--------------|---------------|
     ```
 
-    
+
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions)
   }];
-  
+
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
-    void getAsmValues(RewriterBase &rewriter, 
+    void getAsmValues(RewriterBase &rewriter,
         llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues);
   }];
 }
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 9b1be198f77a82..c93688fb04c3ff 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -209,7 +209,12 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = op->getLoc();
     MLIRContext *context = rewriter.getContext();
-    Value newOp = rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type());
+    LLVM::ConstantRangeAttr bounds = nullptr;
+    if (std::optional<APInt> upperBound = op.getUpperBound())
+      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
+          32, 0, upperBound->getZExtValue());
+    Value newOp =
+        rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds);
     // Truncate or extend the result depending on the index bitwidth specified
     // by the LLVMTypeConverter options.
     const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
@@ -340,27 +345,40 @@ void mlir::populateGpuSubgroupReduceOpLoweringPattern(
 
 void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns) {
+  using gpu::index_lowering::IndexKind;
+  using gpu::index_lowering::IntrType;
   populateWithGenerated(patterns);
   patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
   patterns.add<
       gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
-                                      NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
+                                      NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>(
+      converter, IndexKind::Block, IntrType::Id);
+  patterns.add<
       gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
-                                      NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
+                                      NVVM::BlockDimYOp, NVVM::BlockDimZOp>>(
+      converter, IndexKind::Block, IntrType::Dim);
+  patterns.add<
       gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
-                                      NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
-      gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
-                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
-      gpu::index_lowering::OpLowering<
-          gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
-          NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
-      gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
-                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
-      gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
-                                      NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
-      gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
-                                      NVVM::GridDimYOp, NVVM::GridDimZOp>,
-      GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
+                                      NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>(
+      converter, IndexKind::Other, IntrType::Id);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
+      NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
+      NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
+      converter, IndexKind::Other, IntrType::Id);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
+      NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
+      converter, IndexKind::Block, IntrType::Id);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
+      converter, IndexKind::Grid, IntrType::Dim);
+  patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
+      converter);
 
   patterns.add<GPUDynamicSharedMemoryOpLowering>(
       converter, NVVM::kSharedMemoryAlignmentBit);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
index 855abc12a909ef..bc830a77f3c580 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Target/LLVMIR/ModuleImport.h"
 
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
 using namespace mlir;
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 8f2ec289c9252c..ec93f131d5ec13 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -699,9 +699,21 @@ gpu.module @test_module_32 {
 }
 
 gpu.module @test_module_33 {
-// CHECK-LABEL: func @kernel_with_block_size()
-// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>}
-  gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 128, 1, 1>} {
+// CHECK-LABEL: func @kernel_with_block_size(
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>}
+  gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array<i32: 32, 4, 2>} {
+    // CHECK: = nvvm.read.ptx.sreg.tid.x range <0 : i32, 32 : i32> : i32
+    %0 = gpu.thread_id x
+    // CHECK: = nvvm.read.ptx.sreg.tid.y range <0 : i32, 4 : i32> : i32
+    %1 = gpu.thread_id y
+    // CHECK: = nvvm.read.ptx.sreg.tid.z range <0 : i32, 2 : i32> : i32
+    %2 = gpu.thread_id z
+
+    // Fake usage to prevent dead code elimination
+    %3 = arith.addi %0, %1 : index
+    %4 = arith.addi %3, %2 : index
+    %5 = arith.index_cast %4 : index to i64
+    llvm.store %5, %arg0 : i64, !llvm.ptr
     gpu.return
   }
 }
diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index e4a8773e2dd806..131e9065b2d883 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -58,6 +58,9 @@ define i32 @nvvm_special_regs() {
   %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
   ; CHECK: = nvvm.read.ptx.sreg.cluster.nctarank : i32
   %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
+
+  ; CHECK: = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+  %29 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   ret i32 %1
 }
 
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 6e2787d121ae64..eaf3750958dfd5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -62,7 +62,10 @@ llvm.func @nvvm_special_regs() -> i32 {
   %29 = nvvm.read.ptx.sreg.clock : i32
   // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64
   %30 = nvvm.read.ptx.sreg.clock64 : i64
-  
+
+  // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %31 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+
   llvm.return %1 : i32
 }
 
@@ -84,7 +87,7 @@ llvm.func @llvm_nvvm_barrier0() {
 // CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
 llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
   // CHECK: call void @llvm.nvvm.barrier0()
-  nvvm.barrier 
+  nvvm.barrier
   // CHECK: call void @llvm.nvvm.barrier.n(i32 %[[barId]])
   nvvm.barrier id = %barID
   // CHECK: call void @llvm.nvvm.barrier(i32 %[[barId]], i32 %[[numThreads]])
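
The second range source named in PATCH 1's commit message, `upperBound`
annotations on the operations themselves, flows through the
GPULaneIdOpToNVVM pattern above. A minimal sketch of that path, assuming
the upper_bound assembly form on gpu.lane_id from the companion
gpu-dialect change (not shown in this patch):

  // A lane-id query the producer has bounded at the warp size:
  %lane = gpu.lane_id upper_bound 32
  // After conversion, the bound becomes a half-open range on the read:
  %0 = nvvm.read.ptx.sreg.laneid range <0 : i32, 32 : i32> : i32

The null ConstantRangeAttr fallback in the pattern preserves the old
unannotated form whenever no bound is known.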

From e62da14a21a25b07a3ac64f1f7e3b0b31032d004 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 9 Sep 2024 15:33:40 +0000
Subject: [PATCH 2/3] Put trailing whitespace back

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 212 ++++++++++----------
 mlir/test/Target/LLVMIR/nvvmir.mlir         |   2 +-
 2 files changed, 107 insertions(+), 107 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 66ac9f289d233b..f99be3c307e6c5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -86,8 +86,8 @@ class NVVM_Op<string mnemonic, list<Trait> traits = []> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
 
-/// Base class that defines BasicPtxBuilderOpInterface.
-class NVVM_PTXBuilder_Op<string mnemonic,
+/// Base class that defines BasicPtxBuilderOpInterface. 
+class NVVM_PTXBuilder_Op<string mnemonic, 
   list<Trait> traits = [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> :
   LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
 }
@@ -212,11 +212,11 @@ def ReduxKindMin  : I32EnumAttrCase<"MIN", 4, "min">;
 def ReduxKindOr   : I32EnumAttrCase<"OR", 5, "or">;
 def ReduxKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">;
 def ReduxKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">;
-def ReduxKindXor  : I32EnumAttrCase<"XOR", 8, "xor">;
+def ReduxKindXor  : I32EnumAttrCase<"XOR", 8, "xor">; 
 
 /// Enum attribute of the different kinds.
 def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind",
-  [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr,
+  [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr, 
     ReduxKindUmax, ReduxKindUmin, ReduxKindXor]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
@@ -236,7 +236,7 @@ def NVVM_ReduxOp :
   }];
   let assemblyFormat = [{
     $kind $val `,` $mask_and_clamp  attr-dict `:` type($val) `->` type($res)
-   }];
+   }];   
 }
 
 //===----------------------------------------------------------------------===//
@@ -323,7 +323,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.
   let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
 }
 
-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,  
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> {
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
@@ -331,16 +331,16 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
   }];
 }
 
-def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
+def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,  
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {    
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); }
   }];
 }
 
-def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
-  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {
+def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,  
+  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {  
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -353,13 +353,13 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
         "bra.uni     LAB_WAIT; \n\t"
         "DONE: \n\t"
         "}"
-      );
+      ); 
     }
   }];
 }
 
-def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
+def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,  
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {  
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -372,7 +372,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
         "bra.uni     LAB_WAIT; \n\t"
         "DONE: \n\t"
         "}"
-      );
+      ); 
     }
   }];
 }
@@ -407,7 +407,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
 }
 
 def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
-  let arguments = (ins
+  let arguments = (ins     
     Optional<I32>:$barrierId,
     Optional<I32>:$numberOfThreads);
   string llvmBuilder = [{
@@ -416,7 +416,7 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
                 {$barrierId, $numberOfThreads});
     } else if($barrierId) {
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n,
-                {$barrierId});
+                {$barrierId});   
     } else {
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0);
     }
@@ -425,27 +425,27 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
   let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
 }
 
-def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
+def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> 
 {
   let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads);
 
   let description = [{
-    Thread that executes this op announces their arrival at the barrier with
+    Thread that executes this op announces their arrival at the barrier with 
     given id and continue their execution.
 
-    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
-    `barrierId` is not present, the default barrier id is used.
+    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When 
+    `barrierId` is not present, the default barrier id is used. 
 
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];
-
+  
   let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
 
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
       std::string ptx = "bar.arrive ";
-      if (getBarrierId()) { ptx += "%0, %1;"; }
+      if (getBarrierId()) { ptx += "%0, %1;"; } 
       else { ptx += "0, %0;"; }
       return ptx;
     }
@@ -568,7 +568,7 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];
-
+  
   let assemblyFormat = "attr-dict";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -686,9 +686,9 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];
-
+  
   let assemblyFormat = "attr-dict";
-  let extraClassDefinition = [{
+  let extraClassDefinition = [{        
     std::string $cppClass::getPtx() {
       return std::string("fence.mbarrier_init.release.cluster;");
     }
@@ -764,13 +764,13 @@ def NVVM_SyncWarpOp :
 }
 
 
-def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
+def NVVM_ElectSyncOp : NVVM_Op<"elect.sync", 
                   [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>
-{
+{  
   let results = (outs I1:$pred);
-  let assemblyFormat = "attr-dict `->` type(results)";
-  let extraClassDefinition = [{
-    std::string $cppClass::getPtx() {
+  let assemblyFormat = "attr-dict `->` type(results)";  
+  let extraClassDefinition = [{        
+    std::string $cppClass::getPtx() { 
       return std::string(
         "{                                  \n"
         ".reg .u32 rx;                      \n"
@@ -779,7 +779,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
         "    elect.sync rx | px, 0xFFFFFFFF;\n"
         "@px mov.pred %0, 1;                \n"
         "}\n"
-      );
+      ); 
     }
   }];
 }
@@ -791,16 +791,16 @@ def LoadCacheModifierLU : I32EnumAttrCase<"LU", 3, "lu">;
 def LoadCacheModifierCV : I32EnumAttrCase<"CV", 4, "cv">;
 
 /// Enum attribute of the different kinds.
-def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
+def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", 
                                 "NVVM load cache modifier kind",
-  [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS,
+  [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS, 
     LoadCacheModifierLU, LoadCacheModifierCV]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
   let description = [{
     Enum attribute of the different kinds of cache operators for load instructions.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)    
   }];
 }
 
@@ -826,7 +826,7 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
             id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
           else if($modifier == NVVM::LoadCacheModifierKind::CA)
             id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
-          else
+          else 
             llvm_unreachable("unsupported cache modifier");
           break;
         default:
@@ -839,21 +839,21 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
   let extraClassDeclaration = [{
     bool hasIntrinsic() { if(getCpSize()) return false; return true; }
 
-    void getAsmValues(RewriterBase &rewriter,
+    void getAsmValues(RewriterBase &rewriter, 
         llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) {
       asmValues.push_back({getDst(), PTXRegisterMod::Read});
       asmValues.push_back({getSrc(), PTXRegisterMod::Read});
       asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read});
       asmValues.push_back({getCpSize(), PTXRegisterMod::Read});
-    }
+    }        
   }];
-  let extraClassDefinition = [{
-    std::string $cppClass::getPtx() {
+  let extraClassDefinition = [{        
+    std::string $cppClass::getPtx() { 
       if(getModifier() == NVVM::LoadCacheModifierKind::CG)
         return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n");
       if(getModifier() == NVVM::LoadCacheModifierKind::CA)
         return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n");
-      llvm_unreachable("unsupported cache modifier");
+      llvm_unreachable("unsupported cache modifier");      
     }
   }];
 }
@@ -1541,9 +1541,9 @@ def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">,
   let hasVerifier = 1;
 }
 
-def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
-  Arguments<(ins LLVM_PointerShared:$ptr,
-                 Variadic<I32>:$sources,
+def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, 
+  Arguments<(ins LLVM_PointerShared:$ptr, 
+                 Variadic<I32>:$sources, 
                  MMALayoutAttr:$layout)> {
   let summary = "cooperative matrix store";
   let description = [{
@@ -1552,7 +1552,7 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
   }];
-
+  
   let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -1772,25 +1772,25 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
 }
 
 def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
-  Arguments<(ins
-    ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group,
+  Arguments<(ins 
+    ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group, 
     OptionalAttr<UnitAttr>:$read)> {
   let assemblyFormat = "$group attr-dict";
   let description = [{
     Op waits for completion of the most recent bulk async-groups.
 
     The `$group` operand tells waiting has to be done until for $group or fewer
-    of the most recent bulk async-groups. If `$group` is 0, the op wait until
+    of the most recent bulk async-groups. If `$group` is 0, the op wait until 
     all the most recent bulk async-groups have completed.
 
-    The `$read` indicates that the waiting has to be done until all the bulk
-    async operations in the specified bulk async-group have completed reading
+    The `$read` indicates that the waiting has to be done until all the bulk 
+    async operations in the specified bulk async-group have completed reading 
     from their source locations.
 
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
   }];
-
+  
   string llvmBuilder = [{
     auto intId = op.getRead() ?
       llvm::Intrinsic::nvvm_cp_async_bulk_wait_group_read :
@@ -1799,53 +1799,53 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
   }];
 }
 
-def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
-  NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global",
-  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
+def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : 
+  NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", 
+  [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, 
   AttrSizedOperandSegments]>,
   Arguments<(ins  LLVM_PointerShared:$dstMem,
                   LLVM_AnyPointer:$tmaDescriptor,
                   Variadic<I32>:$coordinates,
-                  LLVM_PointerShared:$mbar,
+                  LLVM_PointerShared:$mbar,                  
                   Variadic<I16>:$im2colOffsets,
                   Optional<I16>:$multicastMask,
                   Optional<I64>:$l2CacheHint,
                   PtxPredicate:$predicate)> {
   let description = [{
-    Initiates an asynchronous copy operation on the tensor data from global
-    memory to shared memory.
+    Initiates an asynchronous copy operation on the tensor data from global 
+    memory to shared memory. 
 
     The Op operates has two load modes:
-    1) Tiled Mode: It's the default mode. The source multi-dimensional tensor
-    layout is preserved at the destination.
+    1) Tiled Mode: It's the default mode. The source multi-dimensional tensor 
+    layout is preserved at the destination. 
 
     2) Im2col Mode: This mode is used when `im2colOffsets` operands are present.
     the elements in the Bounding Box of the source tensor are rearranged into
-    columns at the destination. In this mode, the tensor has to be at least
-    3-dimensional.
+    columns at the destination. In this mode, the tensor has to be at least 
+    3-dimensional. 
 
     The `multicastMask` operand is optional. When it is present, the Op copies
     data from global memory to shared memory of multiple CTAs in the cluster.
-    Operand `multicastMask` specifies the destination CTAs in the cluster such
+    Operand `multicastMask` specifies the destination CTAs in the cluster such 
     that each bit position in the 16-bit `multicastMask` operand corresponds to
-    the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
+    the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.     
 
-    The `l2CacheHint` operand is optional, and it is used to specify cache
+    The `l2CacheHint` operand is optional, and it is used to specify cache 
     eviction policy that may be used during the memory access.
-
+    
     [For more information, see PTX ISA]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
   }];
 
-  let assemblyFormat = [{
-    $dstMem `,`
-    $tmaDescriptor `,`
-    $mbar `,`
-    `box` `[`$coordinates `]`
+  let assemblyFormat = [{ 
+    $dstMem `,` 
+    $tmaDescriptor `,` 
+    $mbar `,` 
+    `box` `[`$coordinates `]` 
     (`im2col` `[` $im2colOffsets^ `]` )?
     (`multicast_mask` `=` $multicastMask^ )?
     (`l2_cache_hint` `=` $l2CacheHint^ )?
-    (`predicate` `=` $predicate^)?
+    (`predicate` `=` $predicate^)? 
     attr-dict  `:` type($dstMem) `,` type($tmaDescriptor)
   }];
 
@@ -1855,16 +1855,16 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
       int dim = getCoordinates().size();
       std::string ptx = "cp.async.bulk.tensor.";
       ptx += std::to_string(dim) + "d.";
-      ptx += "shared::cluster.global.mbarrier::complete_tx::bytes";
+      ptx += "shared::cluster.global.mbarrier::complete_tx::bytes";      
       if(im2colDim) ptx += ".im2col";
-      if(getMulticastMask()) ptx += ".multicast::cluster";
+      if(getMulticastMask()) ptx += ".multicast::cluster";      
       if(getL2CacheHint()) ptx += ".L2::cache_hint";
-
+      
       auto preg = [](int r) { return "%" + std::to_string(r); };
 
       // Build Registers
       ptx += " [%0], [%1, {";
       int r = 2;
       for(int i = 0; i < dim; i++) ptx += preg(r+i) + ",";
       ptx.pop_back(); r += dim;
       ptx += "} ], [%" + std::to_string(r++) + "]";
@@ -1883,19 +1883,19 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
   let hasVerifier = 1;
 }
 
 def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp :
   NVVM_Op<"cp.async.bulk.tensor.global.shared.cta",
   [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
   AttrSizedOperandSegments]>,
   Arguments<(ins  LLVM_AnyPointer:$tmaDescriptor,
                   LLVM_PointerShared:$srcMem,
                   Variadic<I32>:$coordinates,
                   PtxPredicate:$predicate)> {
   let assemblyFormat = [{
     $tmaDescriptor `,`
     $srcMem `,`
     `box` `[`$coordinates `]`
     (`,` `predicate` `=` $predicate^)?
     attr-dict  `:` type(operands)
   }];
   let extraClassDefinition = [{
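The assembly format mirrors the global-to-shared op; a minimal sketch of its use follows (hypothetical names; `type(operands)` lists the descriptor, the source pointer, and each i32 coordinate):

```
nvvm.cp.async.bulk.tensor.global.shared.cta %tma, %src, box [%c0, %c1]
    : !llvm.ptr, !llvm.ptr<3>, i32, i32
```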
@@ -1920,7 +1920,7 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
   Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
   let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
       return std::string("prefetch.tensormap [%0];");
     }
   }];
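A sketch of the prefetch op's usage, with a hypothetical descriptor value and the optional predicate omitted:

```
nvvm.prefetch.tensormap %tma : !llvm.ptr
```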
@@ -1933,9 +1933,9 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
 def NVVM_WgmmaFenceAlignedOp : NVVM_PTXBuilder_Op<"wgmma.fence.aligned"> {
   let arguments = (ins);
   let description = [{
     Enforces an ordering of register accesses between warpgroup-level matrix
     multiplication and other operations.

     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence)
   }];
   let assemblyFormat = "attr-dict";
@@ -1949,7 +1949,7 @@ def NVVM_WgmmaGroupSyncAlignedOp : NVVM_PTXBuilder_Op<"wgmma.commit.group.sync.a
   let assemblyFormat = "attr-dict";
   let description = [{
     Commits all prior uncommitted warpgroup level matrix multiplication operations.
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group)
   }];
   let extraClassDefinition = [{
@@ -1962,7 +1962,7 @@ def NVVM_WgmmaWaitGroupSyncOp : NVVM_PTXBuilder_Op<"wgmma.wait.group.sync.aligne
   let assemblyFormat = "attr-dict $group";
   let description = [{
     Signals the completion of a preceding warpgroup operation.

     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group)
   }];
   let extraClassDefinition = [{
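Taken together, the three aligned ops implement the usual PTX fencing protocol around asynchronous warpgroup MMAs; a typical sequence (sketch) is:

```
nvvm.wgmma.fence.aligned
// ... one or more nvvm.wgmma.mma_async ops ...
nvvm.wgmma.commit.group.sync.aligned
// Wait until at most 0 committed groups are pending, i.e. all have completed.
nvvm.wgmma.wait.group.sync.aligned 0
```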
@@ -2009,7 +2009,7 @@ def WGMMATypeS32 : I32EnumAttrCase<"s32", 9>;
 def WGMMATypes : I32EnumAttr<"WGMMATypes", "NVVM WGMMA types",
   [WGMMATypeF16, WGMMATypeTF32,
     WGMMATypeU8, WGMMATypeS8,
     WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3,
     WGMMATypeF8E5M2, WGMMATypeF32, WGMMATypeS32]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
@@ -2019,44 +2019,44 @@ def WGMMATypesAttr : EnumAttr<NVVM_Dialect, WGMMATypes, "wgmma_type"> {
 }
 
 
 def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
               [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
                 PredOpTrait<"input struct and result struct must be the same type",
                   TCresIsSameAsOpBase<0, 0>>,]>
 {
   let results = (outs LLVM_AnyStruct:$results);
   let arguments = (ins
     LLVM_AnyStruct:$inouts,
     I64:$descriptorA,
     I64:$descriptorB,
     NVVM_MMAShapeAttr:$shape,
     WGMMATypesAttr:$typeA,
     WGMMATypesAttr:$typeB,
     WGMMATypesAttr:$typeD,
     WGMMAScaleOutAttr:$scaleD,
     WGMMAScaleInAttr:$scaleA,
     WGMMAScaleInAttr:$scaleB,
     MMALayoutAttr:$layoutA,
     MMALayoutAttr:$layoutB,
     OptionalAttr<MMAIntOverflowAttr>:$satfinite
   );

    let assemblyFormat = [{
       $descriptorA `,` $descriptorB `,` $inouts `,` $shape `,`
       `D` `[` $typeD `,` $scaleD (`,` $satfinite^)? `]` `,`
       `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,`
       `B` `[` $typeB `,` $scaleB `,` $layoutB `]`
       attr-dict `:`
       type($inouts) `->` type($results)
     }];
   let description = [{
     The warpgroup-level (128 threads) matrix multiply and accumulate operation
     takes one of the following forms, where matrix D is the accumulator:
       D = A * B + D
       D = A * B, where the input from accumulator D is disabled.
 
     Supported shapes:
     ```
     |--------------|--------------|------------|--------------|---------------|
     |              |              |            |              |f16+=e4m3*e4m3 |
@@ -2104,14 +2104,14 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
     |--------------|--------------|------------|--------------|---------------|
     ```
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions)
   }];
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
     void getAsmValues(RewriterBase &rewriter,
         llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues);
   }];
 }
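A sketch of the format above for an f16*f16+f32 MMA of shape m64n8k16 (operand names are hypothetical; with n = 8, each of the 128 threads holds four f32 accumulators, hence the four-element struct; attribute spellings are assumed from the enum attrs defined earlier):

```
%res = nvvm.wgmma.mma_async %descA, %descB, %acc,
    #nvvm.shape<m = 64, n = 8, k = 16>,
    D [<f32>, #nvvm.wgmma_scale_out<one>],
    A [<f16>, #nvvm.wgmma_scale_in<one>, <row>],
    B [<f16>, #nvvm.wgmma_scale_in<one>, <col>]
    : !llvm.struct<(f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32)>
```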
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index eaf3750958dfd5..a8df7864d1bbc6 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -87,7 +87,7 @@ llvm.func @llvm_nvvm_barrier0() {
 // CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
 llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
   // CHECK: call void @llvm.nvvm.barrier0()
   nvvm.barrier
   // CHECK: call void @llvm.nvvm.barrier.n(i32 %[[barId]])
   nvvm.barrier id = %barID
   // CHECK: call void @llvm.nvvm.barrier(i32 %[[barId]], i32 %[[numThreads]])

>From d7a2149ac435fd3e4e8c7932f89182dee6217d55 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 9 Sep 2024 23:16:05 +0000
Subject: [PATCH 3/3] Add test, update for new range syntax from PR below

---
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  2 +-
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     | 20 ++++++++++++++++---
 mlir/test/Target/LLVMIR/nvvmir.mlir           |  2 +-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index c93688fb04c3ff..378eeab12d2e94 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -212,7 +212,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
     LLVM::ConstantRangeAttr bounds = nullptr;
     if (std::optional<APInt> upperBound = op.getUpperBound())
       bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
-          32, 0, upperBound->getZExtValue());
+          /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue());
     Value newOp =
         rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds);
     // Truncate or extend the result depending on the index bitwidth specified
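Concretely, the bound is carried over as a half-open range, so `upper_bound 32` becomes the range [0, 32); the lane-id case (mirrored by the new test below) lowers as:

```
// Input:
%0 = gpu.lane_id upper_bound 32
// After lowering to NVVM (the range attribute excludes the upper bound):
%1 = nvvm.read.ptx.sreg.laneid range <i32, 0, 32> : i32
```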
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index ec93f131d5ec13..c3df23f1862ef8 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -702,11 +702,11 @@ gpu.module @test_module_33 {
 // CHECK-LABEL: func @kernel_with_block_size(
 // CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>}
   gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array<i32: 32, 4, 2>} {
-    // CHECK: = nvvm.read.ptx.sreg.tid.x range <0 : i32, 32 : i32> : i32
+    // CHECK: = nvvm.read.ptx.sreg.tid.x range <i32, 0, 32> : i32
     %0 = gpu.thread_id x
-    // CHECK: = nvvm.read.ptx.sreg.tid.y range <0 : i32, 4 : i32> : i32
+    // CHECK: = nvvm.read.ptx.sreg.tid.y range <i32, 0, 4> : i32
     %1 = gpu.thread_id y
-    // CHECK: = nvvm.read.ptx.sreg.tid.z range <0 : i32, 2 : i32> : i32
+    // CHECK: = nvvm.read.ptx.sreg.tid.z range <i32, 0, 2> : i32
     %2 = gpu.thread_id z
 
     // Fake usage to prevent dead code elimination
@@ -929,6 +929,20 @@ gpu.module @test_module_48 {
   }
 }
 
+gpu.module @test_module_49 {
+// CHECK-LABEL: func @explicit_id_bounds()
+  func.func @explicit_id_bounds() -> (index, index, index) {
+    // CHECK: = nvvm.read.ptx.sreg.tid.x range <i32, 0, 32> : i32
+    %0 = gpu.thread_id x upper_bound 32
+    // CHECK: = nvvm.read.ptx.sreg.ntid.x range <i32, 1, 33> : i32
+    %1 = gpu.block_dim x upper_bound 32
+    // CHECK: = nvvm.read.ptx.sreg.laneid range <i32, 0, 32> : i32
+    %2 = gpu.lane_id upper_bound 32
+
+    return %0, %1, %2 : index, index, index
+  }
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
     %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8df7864d1bbc6..48d73bfc85e469 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -64,7 +64,7 @@ llvm.func @nvvm_special_regs() -> i32 {
   %30 = nvvm.read.ptx.sreg.clock64 : i64
 
   // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %31 = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32
+  %31 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 64> : i32
 
   llvm.return %1 : i32
 }


