[llvm] [mlir] [MLIR][NVVM] Add globaltimer_lo support in NVVM Dialect and NVPTX backend (PR #154672)

Dharuni R Acharya via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 21 03:04:02 PDT 2025


https://github.com/DharuniRAcharya updated https://github.com/llvm/llvm-project/pull/154672

>From 5edc271751743970ddeb4a67e9cf5469d3b6be13 Mon Sep 17 00:00:00 2001
From: Dharuni R Acharya <dharunira at nvidia.com>
Date: Thu, 21 Aug 2025 04:59:19 +0000
Subject: [PATCH 1/2] [MLIR][NVPTX] Expose globaltimer_lo in NVVM Dialect and
 Add corresponding intrinsics in NVPTX backend

---
 llvm/include/llvm/IR/IntrinsicsNVVM.td      |  1 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  2 +
 llvm/test/CodeGen/NVPTX/intrinsics.ll       | 17 ++++
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td |  1 +
 mlir/test/Target/LLVMIR/nvvmir.mlir         | 90 +++++++++++----------
 5 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 1bcc442a3f77f..49eaf9770b717 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1792,6 +1792,7 @@ def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32;
 def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64;
 
 def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic_r64;
+def int_nvvm_read_ptx_sreg_globaltimer_lo : PTXReadNCSRegIntrinsic_r32;
 
 def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32;
 def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 721afae4db51c..c280b68d2298d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4358,10 +4358,12 @@ let hasSideEffects = 1 in {
   def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
   def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
   def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
+  def SREG_GLOBALTIMER_LO : PTX_READ_SREG_R32<"globaltimer_lo", int_nvvm_read_ptx_sreg_globaltimer_lo>;
 }
 
 def: Pat <(i64 (readcyclecounter)), (SREG_CLOCK64)>;
 def: Pat <(i64 (readsteadycounter)), (SREG_GLOBALTIMER)>;
+def: Pat <(i32 (readsteadycounter)), (SREG_GLOBALTIMER_LO)>;
 
 def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
 def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 4ed50632251cb..6bdb8ead7a64a 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -267,6 +267,23 @@ define i64 @test_globaltimer() {
   ret i64 %ret
 }
 
+define i32 @test_globaltimer_lo(){
+; CHECK-LABEL: test_globaltimer_lo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %globaltimer_lo;
+; CHECK-NEXT:    mov.u32 %r2, %globaltimer_lo;
+; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo()
+  %b = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo()
+  %ret = add i32 %a, %b
+  ret i32 %ret
+}
+
 define i64 @test_cyclecounter() {
 ; CHECK-LABEL: test_cyclecounter(
 ; CHECK:       {
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index f9cd58de8915f..a3c440be9489a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -258,6 +258,7 @@ def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster
 def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
 def NVVM_Clock64Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock64">;
 def NVVM_GlobalTimerOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.globaltimer">;
+def NVVM_GlobalTimerLoOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.globaltimer.lo">;
 
 //===----------------------------------------------------------------------===//
 // envreg registers
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index c8ba91efbff4d..22e6b648068a2 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -64,92 +64,94 @@ llvm.func @nvvm_special_regs() -> i32 {
   %30 = nvvm.read.ptx.sreg.clock64 : i64
   // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer
   %31 = nvvm.read.ptx.sreg.globaltimer : i64
-  // CHECK: %32 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %32 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 64> : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo()
+  %32 = nvvm.read.ptx.sreg.globaltimer.lo : i32
+  // CHECK: %33 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %33 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 64> : i32
   // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid
-  %33 = nvvm.read.ptx.sreg.warpid : i32
+  %34 = nvvm.read.ptx.sreg.warpid : i32
   // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid
-  %34 = nvvm.read.ptx.sreg.nwarpid : i32
+  %35 = nvvm.read.ptx.sreg.nwarpid : i32
   // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid
-  %35 = nvvm.read.ptx.sreg.smid : i32
+  %36 = nvvm.read.ptx.sreg.smid : i32
   // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid
-  %36 = nvvm.read.ptx.sreg.nsmid : i32
+  %37 = nvvm.read.ptx.sreg.nsmid : i32
   // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid
-  %37 = nvvm.read.ptx.sreg.gridid : i32
+  %38 = nvvm.read.ptx.sreg.gridid : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg0
-  %38 = nvvm.read.ptx.sreg.envreg0 : i32
+  %39 = nvvm.read.ptx.sreg.envreg0 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg1
-  %39 = nvvm.read.ptx.sreg.envreg1 : i32
+  %40 = nvvm.read.ptx.sreg.envreg1 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg2
-  %40 = nvvm.read.ptx.sreg.envreg2 : i32
+  %41 = nvvm.read.ptx.sreg.envreg2 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg3
-  %41 = nvvm.read.ptx.sreg.envreg3 : i32
+  %42 = nvvm.read.ptx.sreg.envreg3 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg4
-  %42 = nvvm.read.ptx.sreg.envreg4 : i32
+  %43 = nvvm.read.ptx.sreg.envreg4 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg5
-  %43 = nvvm.read.ptx.sreg.envreg5 : i32
+  %44 = nvvm.read.ptx.sreg.envreg5 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg6
-  %44 = nvvm.read.ptx.sreg.envreg6 : i32
+  %45 = nvvm.read.ptx.sreg.envreg6 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg7
-  %45 = nvvm.read.ptx.sreg.envreg7 : i32
+  %46 = nvvm.read.ptx.sreg.envreg7 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg8
-  %46 = nvvm.read.ptx.sreg.envreg8 : i32
+  %47 = nvvm.read.ptx.sreg.envreg8 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg9
-  %47 = nvvm.read.ptx.sreg.envreg9 : i32
+  %48 = nvvm.read.ptx.sreg.envreg9 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg10
-  %48 = nvvm.read.ptx.sreg.envreg10 : i32
+  %49 = nvvm.read.ptx.sreg.envreg10 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg11
-  %49 = nvvm.read.ptx.sreg.envreg11 : i32
+  %50 = nvvm.read.ptx.sreg.envreg11 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg12
-  %50 = nvvm.read.ptx.sreg.envreg12 : i32
+  %51 = nvvm.read.ptx.sreg.envreg12 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg13
-  %51 = nvvm.read.ptx.sreg.envreg13 : i32
+  %52 = nvvm.read.ptx.sreg.envreg13 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg14
-  %52 = nvvm.read.ptx.sreg.envreg14 : i32
+  %53 = nvvm.read.ptx.sreg.envreg14 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg15
-  %53 = nvvm.read.ptx.sreg.envreg15 : i32
+  %54 = nvvm.read.ptx.sreg.envreg15 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg16
-  %54 = nvvm.read.ptx.sreg.envreg16 : i32
+  %55 = nvvm.read.ptx.sreg.envreg16 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg17
-  %55 = nvvm.read.ptx.sreg.envreg17 : i32
+  %56 = nvvm.read.ptx.sreg.envreg17 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg18
-  %56 = nvvm.read.ptx.sreg.envreg18 : i32
+  %57 = nvvm.read.ptx.sreg.envreg18 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg19
-  %57 = nvvm.read.ptx.sreg.envreg19 : i32
+  %58 = nvvm.read.ptx.sreg.envreg19 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg20
-  %58 = nvvm.read.ptx.sreg.envreg20 : i32
+  %59 = nvvm.read.ptx.sreg.envreg20 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg21
-  %59 = nvvm.read.ptx.sreg.envreg21 : i32
+  %60 = nvvm.read.ptx.sreg.envreg21 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg22
-  %60 = nvvm.read.ptx.sreg.envreg22 : i32
+  %61 = nvvm.read.ptx.sreg.envreg22 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg23
-  %61 = nvvm.read.ptx.sreg.envreg23 : i32
+  %62 = nvvm.read.ptx.sreg.envreg23 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg24
-  %62 = nvvm.read.ptx.sreg.envreg24 : i32
+  %63 = nvvm.read.ptx.sreg.envreg24 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg25
-  %63 = nvvm.read.ptx.sreg.envreg25 : i32
+  %64 = nvvm.read.ptx.sreg.envreg25 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg26
-  %64 = nvvm.read.ptx.sreg.envreg26 : i32
+  %65 = nvvm.read.ptx.sreg.envreg26 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg27
-  %65 = nvvm.read.ptx.sreg.envreg27 : i32
+  %66 = nvvm.read.ptx.sreg.envreg27 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg28
-  %66 = nvvm.read.ptx.sreg.envreg28 : i32
+  %67 = nvvm.read.ptx.sreg.envreg28 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg29
-  %67 = nvvm.read.ptx.sreg.envreg29 : i32
+  %68 = nvvm.read.ptx.sreg.envreg29 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg30
-  %68 = nvvm.read.ptx.sreg.envreg30 : i32
+  %69 = nvvm.read.ptx.sreg.envreg30 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg31
-  %69 = nvvm.read.ptx.sreg.envreg31 : i32
+  %70 = nvvm.read.ptx.sreg.envreg31 : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq
-  %70 = nvvm.read.ptx.sreg.lanemask.eq : i32
+  %71 = nvvm.read.ptx.sreg.lanemask.eq : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le
-  %71 = nvvm.read.ptx.sreg.lanemask.le : i32
+  %72 = nvvm.read.ptx.sreg.lanemask.le : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt
-  %72 = nvvm.read.ptx.sreg.lanemask.lt : i32
+  %73 = nvvm.read.ptx.sreg.lanemask.lt : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge
-  %73 = nvvm.read.ptx.sreg.lanemask.ge : i32
+  %74 = nvvm.read.ptx.sreg.lanemask.ge : i32
   //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt
-  %74 = nvvm.read.ptx.sreg.lanemask.gt : i32
+  %75 = nvvm.read.ptx.sreg.lanemask.gt : i32
   llvm.return %1 : i32
 }
 

>From ce31f4952352b2aefc3334a835665c7e2d9bc80e Mon Sep 17 00:00:00 2001
From: Dharuni R Acharya <dharunira at nvidia.com>
Date: Thu, 21 Aug 2025 09:54:53 +0000
Subject: [PATCH 2/2] [MLIR][NVVM] Add globaltimer_lo support in NVVM Dialect
 and NVPTX backend. This patch adds support for reading the global timer low
 register in the NVVM dialect and NVPTX backend. This change includes the
 addition of the NVVM_GlobalTimerLoOp operation to the NVVM dialect and the
 int_nvvm_read_ptx_sreg_globaltimer_lo intrinsic to the NVPTX backend. Lit
 tests have been added. Adding the relevant PTX link here for reference:
 https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-globaltimer

---
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 49eaf9770b717..688fc3d9e226e 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1791,8 +1791,8 @@ def int_nvvm_read_ptx_sreg_lanemask_gt : PTXReadSRegIntrinsic_r32;
 def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32;
 def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64;
 
-def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic_r64;
-def int_nvvm_read_ptx_sreg_globaltimer_lo : PTXReadNCSRegIntrinsic_r32;
+def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic<llvm_i64_ty>;
+def int_nvvm_read_ptx_sreg_globaltimer_lo : PTXReadNCSRegIntrinsic<llvm_i32_ty>;
 
 def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32;
 def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32;



More information about the llvm-commits mailing list