[llvm] [NVPTX] prefetch.tensormap pattern rewriter fix (PR #159253)

Abhilash Majumder via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 16 23:50:04 PDT 2025


https://github.com/abhilash1910 created https://github.com/llvm/llvm-project/pull/159253

Context: As highlighted in #156830, there is an ISel lowering issue in the NVPTX backend for the prefetch.tensormap intrinsic, caused by an unguarded pattern rewrite in the infer-address-spaces pass. For ptx_kernel-attributed functions, the rewriter cannot keep the intrinsic's pointer in the generic address space (addrspace 0): the pointer is implicitly inferred to the global address space, and the rewritten intrinsic has no ISel pattern, so instruction selection fails. This PR fixes that by guarding the rewrite so it only fires for the address spaces the instruction supports (const and param) and bails out otherwise. No MLIR op changes should be needed, since the failure occurs only in the backend.
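
For reference, a minimal IR reproducer (a sketch distilled from the negative test added below; the kernel name is illustrative):

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)

; Before this fix, infer-address-spaces rewrote the call below to
; @llvm.nvvm.prefetch.tensormap.p1 (global), which has no ISel pattern and
; crashed instruction selection. With the guard, the rewrite is skipped and
; the call stays in the generic address space.
define ptx_kernel void @repro(ptr addrspace(1) %global_ptr) {
entry:
  %cast = addrspacecast ptr addrspace(1) %global_ptr to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
  ret void
}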

cc @durga4github 

FYI: @Wolfram70 @rupprecht  @castigli  

From be0d3afdfed00ea7964bb04039e5ac1ea4441987 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <abmajumder at nvidia.com>
Date: Wed, 17 Sep 2025 12:09:39 +0530
Subject: [PATCH 1/2] pattern rewriter fix

---
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp |   8 +-
 .../CodeGen/NVPTX/prefetch-inferas-test.ll    |  45 +++++++-
 llvm/test/CodeGen/NVPTX/prefetch.ll           |  43 ++++++-
 llvm/test/CodeGen/NVPTX/prefetch.s            | 105 ++++++++++++++++++
 4 files changed, 196 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/prefetch.s

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index f4f89613b358d..b920da0d04203 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -588,10 +588,14 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
       return ConstantInt::get(II->getType(), *R);
     return nullptr;
   }
   case Intrinsic::nvvm_prefetch_tensormap: {
     IRBuilder<> Builder(II);
-    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
-                                        NewV);
+    const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+    if (NewAS == NVPTXAS::ADDRESS_SPACE_CONST ||
+        NewAS == NVPTXAS::ADDRESS_SPACE_PARAM)
+      return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
+                                          NewV);
+    return nullptr;
   }
   }
   return nullptr;
diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
index bc67471209bf8..ed625876869cc 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
@@ -1,6 +1,8 @@
 ; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
 ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: not llc -march=nvptx64 %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; XFAIL: *
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
@@ -11,7 +13,6 @@ target triple = "nvptx64-unknown-unknown"
 define void @test_infer_const_from_cast() {
 ; INFER-LABEL: @test_infer_const_from_cast
 ; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
-; BOTH: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
 ; PTX-LABEL: .visible .func test_infer_const_from_cast(
 ; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
 ; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
@@ -69,7 +70,47 @@ entry:
   %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
   %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
   %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
-  call void @llvm.nvvm.prefetch.tensormap(ptr %cast3)
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+  ret void
+}
+
+; Kernel Function Test
+; Cast from Const space to Generic
+define ptx_kernel void @test_const_to_generic_cast_kernel(ptr addrspace(4) %const_ptr) {
+; INFER-LABEL: @test_const_to_generic_cast_kernel
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+; PTX-LABEL: .visible .entry test_const_to_generic_cast_kernel(
+; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
+entry:
+  %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+  ret void
+}
+
+; Kernel Function Test
+; Multiple casts in sequence
+define ptx_kernel void @test_infer_through_multiple_casts_kernel() {
+; INFER-LABEL: @test_infer_through_multiple_casts_kernel
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .entry test_infer_through_multiple_casts_kernel(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+  %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+  %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
+  %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+  ret void
+}
+
+
+; Negative test case for global to generic addrspace cast
+define void @test_global_to_generic_cast(ptr addrspace(1) %global_ptr) {
+; ERR: unsupported prefetch address space cast
+entry:
+  %cast = addrspacecast ptr addrspace(1) %global_ptr to ptr
+  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index a1c5ec8f50a6b..0b1f0fdd5e85f 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -121,4 +121,45 @@ define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
   ret void
-}
\ No newline at end of file
+}
+
+define ptx_kernel void @prefetch_tensormap_kernel(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap_kernel(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_tensormap_kernel_param_0];
+; CHECK-PTX64-NEXT:    prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  ret void
+}
+
+define ptx_kernel void @prefetch_const_tensormap_kernel(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap_kernel(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_const_tensormap_kernel_param_0];
+; CHECK-PTX64-NEXT:    prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  ret void
+}
+
+define ptx_kernel void @prefetch_param_tensormap_kernel(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap_kernel(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_param_tensormap_kernel_param_0];
+; CHECK-PTX64-NEXT:    prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+  ret void
+}
+
+
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.s b/llvm/test/CodeGen/NVPTX/prefetch.s
new file mode 100644
index 0000000000000..31d0ac68a1472
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/prefetch.s
@@ -0,0 +1,105 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.0
+.target sm_90a
+.address_size 64
+
+	// .globl	prefetch_local          // -- Begin function prefetch_local
+                                        // @prefetch_local
+.visible .func prefetch_local(
+	.param .b64 prefetch_local_param_0
+)
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch_local_param_0];
+	prefetch.local.L1 	[%rd1];
+	prefetch.local.L2 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetch_global         // -- Begin function prefetch_global
+.visible .func prefetch_global(
+	.param .b64 prefetch_global_param_0
+)                                       // @prefetch_global
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch_global_param_0];
+	prefetch.global.L1 	[%rd1];
+	prefetch.global.L2 	[%rd1];
+	prefetch.global.L2::evict_normal 	[%rd1];
+	prefetch.global.L2::evict_last 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetch_               // -- Begin function prefetch_
+.visible .func prefetch_(
+	.param .b64 prefetch__param_0
+)                                       // @prefetch_
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch__param_0];
+	prefetch.L1 	[%rd1];
+	prefetch.L2 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetchu_l1            // -- Begin function prefetchu_l1
+.visible .func prefetchu_l1(
+	.param .b64 prefetchu_l1_param_0
+)                                       // @prefetchu_l1
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetchu_l1_param_0];
+	prefetchu.L1 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetch_tensormap      // -- Begin function prefetch_tensormap
+.visible .func prefetch_tensormap(
+	.param .b64 prefetch_tensormap_param_0
+)                                       // @prefetch_tensormap
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch_tensormap_param_0];
+	prefetch.tensormap 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetch_const_tensormap // -- Begin function prefetch_const_tensormap
+.visible .func prefetch_const_tensormap(
+	.param .b64 prefetch_const_tensormap_param_0
+)                                       // @prefetch_const_tensormap
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch_const_tensormap_param_0];
+	prefetch.const.tensormap 	[%rd1];
+	ret;
+                                        // -- End function
+}
+	// .globl	prefetch_param_tensormap // -- Begin function prefetch_param_tensormap
+.visible .func prefetch_param_tensormap(
+	.param .b64 prefetch_param_tensormap_param_0
+)                                       // @prefetch_param_tensormap
+{
+	.reg .b64 	%rd<2>;
+
+// %bb.0:
+	ld.param.b64 	%rd1, [prefetch_param_tensormap_param_0];
+	prefetch.param.tensormap 	[%rd1];
+	ret;
+                                        // -- End function
+}

From 32bf60a241dae7482a94d7cbc60777378313d930 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <abmajumder at nvidia.com>
Date: Wed, 17 Sep 2025 12:19:02 +0530
Subject: [PATCH 2/2] remove s

---
 llvm/test/CodeGen/NVPTX/prefetch.s | 105 -----------------------------
 1 file changed, 105 deletions(-)
 delete mode 100644 llvm/test/CodeGen/NVPTX/prefetch.s

diff --git a/llvm/test/CodeGen/NVPTX/prefetch.s b/llvm/test/CodeGen/NVPTX/prefetch.s
deleted file mode 100644
index 31d0ac68a1472..0000000000000
--- a/llvm/test/CodeGen/NVPTX/prefetch.s
+++ /dev/null
@@ -1,105 +0,0 @@
-//
-// Generated by LLVM NVPTX Back-End
-//
-
-.version 8.0
-.target sm_90a
-.address_size 64
-
-	// .globl	prefetch_local          // -- Begin function prefetch_local
-                                        // @prefetch_local
-.visible .func prefetch_local(
-	.param .b64 prefetch_local_param_0
-)
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch_local_param_0];
-	prefetch.local.L1 	[%rd1];
-	prefetch.local.L2 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetch_global         // -- Begin function prefetch_global
-.visible .func prefetch_global(
-	.param .b64 prefetch_global_param_0
-)                                       // @prefetch_global
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch_global_param_0];
-	prefetch.global.L1 	[%rd1];
-	prefetch.global.L2 	[%rd1];
-	prefetch.global.L2::evict_normal 	[%rd1];
-	prefetch.global.L2::evict_last 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetch_               // -- Begin function prefetch_
-.visible .func prefetch_(
-	.param .b64 prefetch__param_0
-)                                       // @prefetch_
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch__param_0];
-	prefetch.L1 	[%rd1];
-	prefetch.L2 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetchu_l1            // -- Begin function prefetchu_l1
-.visible .func prefetchu_l1(
-	.param .b64 prefetchu_l1_param_0
-)                                       // @prefetchu_l1
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetchu_l1_param_0];
-	prefetchu.L1 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetch_tensormap      // -- Begin function prefetch_tensormap
-.visible .func prefetch_tensormap(
-	.param .b64 prefetch_tensormap_param_0
-)                                       // @prefetch_tensormap
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch_tensormap_param_0];
-	prefetch.tensormap 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetch_const_tensormap // -- Begin function prefetch_const_tensormap
-.visible .func prefetch_const_tensormap(
-	.param .b64 prefetch_const_tensormap_param_0
-)                                       // @prefetch_const_tensormap
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch_const_tensormap_param_0];
-	prefetch.const.tensormap 	[%rd1];
-	ret;
-                                        // -- End function
-}
-	// .globl	prefetch_param_tensormap // -- Begin function prefetch_param_tensormap
-.visible .func prefetch_param_tensormap(
-	.param .b64 prefetch_param_tensormap_param_0
-)                                       // @prefetch_param_tensormap
-{
-	.reg .b64 	%rd<2>;
-
-// %bb.0:
-	ld.param.b64 	%rd1, [prefetch_param_tensormap_param_0];
-	prefetch.param.tensormap 	[%rd1];
-	ret;
-                                        // -- End function
-}


