[llvm] [SelectionDAG] Preserve volatile undef stores. (PR #99918)

Matt Davis via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 23 11:04:43 PDT 2024


https://github.com/enferex updated https://github.com/llvm/llvm-project/pull/99918

>From 75fcfe07b7f8199e834f79a318b10c41a5118810 Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Fri, 28 Jun 2024 11:04:33 -0700
Subject: [PATCH 1/5] [SelectionDAG] Preserve volatile undef stores.

This patch preserves undef store SDNodes that are volatile-qualified.
Previously, these nodes would be discarded. The motivation behind this
change is to adhere to the LangRef: even though that document is mostly
in terms of LLVM IR, it seems reasonable to assume that the volatile
constraints also apply to SDNodes.

>From the LangRef: https://llvm.org/docs/LangRef.html#volatile-memory-accesses
> Certain memory accesses, such as load's, store's, and llvm.memcpy's may
be marked volatile. The optimizers must not change the number of
volatile operations or change their order of execution relative to other
volatile operations. The optimizers may change the order of volatile
operations relative to non-volatile operations.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 .../abi-attribute-hints-undefined-behavior.ll |  2 +-
 .../CodeGen/AMDGPU/bitcast-vector-extract.ll  |  4 ++--
 .../AMDGPU/lds-global-non-entry-func.ll       |  2 +-
 .../lower-work-group-id-intrinsics-pal.ll     | 23 ++++++++++++++-----
 llvm/test/CodeGen/AMDGPU/mem-builtins.ll      | 12 +++++-----
 llvm/test/CodeGen/NVPTX/store-undef.ll        | 16 +++++++++++++
 7 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aa9032ea2574c..9089bfd2a0381 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21413,7 +21413,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   }
 
   // Turn 'store undef, Ptr' -> nothing.
-  if (Value.isUndef() && ST->isUnindexed())
+  if (Value.isUndef() && ST->isUnindexed() && !ST->isVolatile())
     return Chain;
 
   // Try to infer better alignment information than the store already has.
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index ae20ab1de3a2d..32f6c79355d42 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -255,7 +255,7 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
   %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
   %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
   %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
-  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
+  store i64 %dispatch.id, ptr addrspace(1) %ptr
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 80732d5de1e20..ca339938161bd 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(
 define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
   %bc = bitcast i64 %undef to <2 x i32>
-  store volatile <2 x i32> %bc, ptr addrspace(1) %out
+  store <2 x i32> %bc, ptr addrspace(1) %out
   ret void
 }
 
@@ -83,7 +83,7 @@ define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractel
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
   %bc = bitcast i64 %undef to <2 x i32>
   %elt1 = extractelement <2 x i32> %bc, i32 1
-  store volatile i32 %elt1, ptr addrspace(1) %out
+  store i32 %elt1, ptr addrspace(1) %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 3b3e107a62967..db04af0861293 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -153,7 +153,7 @@ define void @func_use_lds_global_constexpr_cast() {
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB1_2:
 ; GISEL-NEXT:    s_endpgm
-  store volatile i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
+  store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 8009f917aef5a..f90753652baa5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
@@ -126,10 +126,21 @@ define amdgpu_cs void @caller() {
 declare amdgpu_gfx void @callee(i32)
 
 define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
-; GFX9-LABEL: workgroup_ids_gfx:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: workgroup_ids_gfx:
+; GFX9-SDAG:           ; %bb.0:
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[0:1], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:        s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_gfx:
+; GFX9-GISEL:          ; %bb.0:
+; GFX9-GISEL-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:       s_setpc_b64 s[30:31]
 ;
 ; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
 ; GFX9ARCH-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index dd892ec3d59b3..0c43facb760b4 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -16,7 +16,7 @@ declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1
 define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
 entry:
   %cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
-  store volatile i32 %cmp, ptr addrspace(1) undef
+  store i32 %cmp, ptr addrspace(1) undef
   ret void
 }
 
@@ -26,7 +26,7 @@ entry:
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr at rel32@hi+12
 define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
   %res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
-  store volatile ptr addrspace(1) %res, ptr addrspace(1) undef
+  store ptr addrspace(1) %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -36,7 +36,7 @@ define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %le
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy at rel32@hi+12
 define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
   %res = call ptr @strcpy(ptr %dst, ptr %src)
-  store volatile ptr %res, ptr addrspace(1) undef
+  store ptr %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -46,7 +46,7 @@ define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp at rel32@hi+12
 define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
   %res = call i32 @strcmp(ptr %src0, ptr %src1)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -56,7 +56,7 @@ define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen at rel32@hi+12
 define amdgpu_kernel void @test_strlen(ptr %src) #0 {
   %res = call i32 @strlen(ptr %src)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -66,7 +66,7 @@ define amdgpu_kernel void @test_strlen(ptr %src) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen at rel32@hi+12
 define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
   %res = call i32 @strnlen(ptr %src, i32 %size)
-  store volatile i32 %res, ptr addrspace(1) undef
+  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 4941760a78c79..6d54a15af25d9 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -90,3 +90,19 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
   store %struct.T %S2, ptr %out
   ret void
 }
+
+define void @test_store_volatile_undef(ptr %out) {
+; CHECK-LABEL: test_store_volatile_undef(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ret;
+  store volatile %struct.T undef, ptr %out
+  ret void
+}

>From 13dad1bb08faa566994dca30585743615f92c7ca Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Tue, 23 Jul 2024 04:27:37 -0700
Subject: [PATCH 2/5] Remove dummy stores from a few test cases and add a
 store-to-poison test.

The AMDGPU tests were cleaned up to remove the store instead of dropping
the volatile qualifier as I had done in the original patch.

The NVPTX test was expanded to test storing of a vector to poison.
---
 .../AMDGPU/abi-attribute-hints-undefined-behavior.ll  |  1 -
 llvm/test/CodeGen/AMDGPU/mem-builtins.ll              |  6 ------
 llvm/test/CodeGen/NVPTX/store-undef.ll                | 11 ++++++++---
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 32f6c79355d42..e53653408feb4 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -255,7 +255,6 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
   %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
   %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
   %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
-  store i64 %dispatch.id, ptr addrspace(1) %ptr
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index 0c43facb760b4..a4d1cdbd55b61 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -16,7 +16,6 @@ declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1
 define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
 entry:
   %cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
-  store i32 %cmp, ptr addrspace(1) undef
   ret void
 }
 
@@ -26,7 +25,6 @@ entry:
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr at rel32@hi+12
 define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
   %res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
-  store ptr addrspace(1) %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -36,7 +34,6 @@ define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %le
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy at rel32@hi+12
 define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
   %res = call ptr @strcpy(ptr %dst, ptr %src)
-  store ptr %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -46,7 +43,6 @@ define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp at rel32@hi+12
 define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
   %res = call i32 @strcmp(ptr %src0, ptr %src1)
-  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -56,7 +52,6 @@ define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen at rel32@hi+12
 define amdgpu_kernel void @test_strlen(ptr %src) #0 {
   %res = call i32 @strlen(ptr %src)
-  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -66,7 +61,6 @@ define amdgpu_kernel void @test_strlen(ptr %src) #0 {
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen at rel32@hi+12
 define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
   %res = call i32 @strnlen(ptr %src, i32 %size)
-  store i32 %res, ptr addrspace(1) undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 6d54a15af25d9..7aca0835c7862 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -91,18 +91,23 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
   ret void
 }
 
-define void @test_store_volatile_undef(ptr %out) {
+define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
 ; CHECK-LABEL: test_store_volatile_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
 ; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
 ; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1];
+; CHECK-NEXT:    ld.param.v4.u32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd3], {%r11, %r12, %r13, %r14};
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10};
 ; CHECK-NEXT:    ret;
   store volatile %struct.T undef, ptr %out
+  store volatile <8 x i32> %vec, <8 x i32>* poison
   ret void
 }

>From bbbfe89d74a1b10d8870617f0eded895c3fa616e Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Tue, 23 Jul 2024 05:51:41 -0700
Subject: [PATCH 3/5] Update the AMDGPU/mem-builtins.ll to store to a non-dummy
 output formal.

---
 llvm/test/CodeGen/AMDGPU/mem-builtins.ll | 30 ++++++++++++++----------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index a4d1cdbd55b61..99090da4da513 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -9,58 +9,64 @@ declare hidden i32 @strnlen(ptr nocapture, i32) #1
 declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1
 
 
-; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr): unsupported call to function memcmp
+; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr, ptr addrspace(1)): unsupported call to function memcmp
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp at rel32@hi+12
-define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
+define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p, ptr addrspace(1) %out) #0 {
 entry:
   %cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
+  store i32 %cmp, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64): unsupported call to function memchr
+; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64, ptr addrspace(1)): unsupported call to function memchr
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr at rel32@hi+12
-define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
+define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len, ptr addrspace(1) %out) #0 {
   %res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
+  store ptr addrspace(1) %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr): unsupported call to function strcpy
+; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcpy
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy at rel32@hi+12
-define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
+define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src, ptr addrspace(1) %out) #0 {
   %res = call ptr @strcpy(ptr %dst, ptr %src)
+  store ptr %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr): unsupported call to function strcmp
+; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcmp
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp at rel32@hi+12
-define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
+define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1, ptr addrspace(1) %out) #0 {
   %res = call i32 @strcmp(ptr %src0, ptr %src1)
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr): unsupported call to function strlen
+; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr, ptr addrspace(1)): unsupported call to function strlen
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen at rel32@hi+12
-define amdgpu_kernel void @test_strlen(ptr %src) #0 {
+define amdgpu_kernel void @test_strlen(ptr %src, ptr addrspace(1) %out) #0 {
   %res = call i32 @strlen(ptr %src)
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32): unsupported call to function strnlen
+; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32, ptr addrspace(1)): unsupported call to function strnlen
 
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen at rel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen at rel32@hi+12
-define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
+define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size, ptr addrspace(1) %out) #0 {
   %res = call i32 @strnlen(ptr %src, i32 %size)
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 

>From be407ec0ff7e0ae2b9cbeb733b2a84dc2d7d8292 Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Tue, 23 Jul 2024 06:11:18 -0700
Subject: [PATCH 4/5] Update AMDGPU/lds-global-non-entry-func.ll: store to a
 dummy output formal.

---
 llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index db04af0861293..397502711283e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -98,8 +98,8 @@ define void @func_use_lds_global() {
   ret void
 }
 
-; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (): local memory global used by non-kernel function
-define void @func_use_lds_global_constexpr_cast() {
+; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (ptr addrspace(1)): local memory global used by non-kernel function
+define void @func_use_lds_global_constexpr_cast(ptr addrspace(1) %out) {
 ; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -153,7 +153,7 @@ define void @func_use_lds_global_constexpr_cast() {
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB1_2:
 ; GISEL-NEXT:    s_endpgm
-  store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
+  store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) %out, align 4
   ret void
 }
 

>From 1372c6ce047c75b945651a0412d8c91690f817f8 Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Tue, 23 Jul 2024 10:26:05 -0700
Subject: [PATCH 5/5] Create separate test routines to exercise testing poison.

---
 llvm/test/CodeGen/NVPTX/store-undef.ll | 38 ++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 7aca0835c7862..9f5461339a923 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -91,7 +91,7 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
   ret void
 }
 
-define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
+define void @test_store_volatile_undef(ptr %out, <8 x i32> %outVec) {
 ; CHECK-LABEL: test_store_volatile_undef(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<15>;
@@ -108,6 +108,40 @@ define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
 ; CHECK-NEXT:    st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10};
 ; CHECK-NEXT:    ret;
   store volatile %struct.T undef, ptr %out
-  store volatile <8 x i32> %vec, <8 x i32>* poison
+  store volatile <8 x i32> %outVec, ptr undef
+  ret void
+}
+
+define void @test_store_volatile_of_poison(ptr %out) {
+; CHECK-LABEL: test_store_volatile_of_poison(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_of_poison_param_0];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ret;
+  store volatile %struct.T poison, ptr %out
+  ret void
+}
+
+define void @test_store_volatile_to_poison(%struct.T %param) {
+; CHECK-LABEL: test_store_volatile_to_poison(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_to_poison_param_0];
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8];
+; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16];
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd2], {%r3, %r4, %r5, %r6};
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd3], {%r1, %r2};
+; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd1;
+; CHECK-NEXT:    ret;
+  store volatile %struct.T %param, ptr poison
   ret void
 }



More information about the llvm-commits mailing list