[llvm] [SelectionDAG] Preserve volatile undef stores. (PR #99918)
Matt Davis via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 22 12:04:24 PDT 2024
https://github.com/enferex created https://github.com/llvm/llvm-project/pull/99918
This patch preserves `undef` SDNodes that are `volatile` qualified. Previously, these nodes would be discarded. The motivation behind this change is to adhere to the [LangRef](https://llvm.org/docs/LangRef.html#volatile-memory-accesses). Even though that doc is written mostly in terms of LLVM IR, it seems reasonable to infer that the volatile constraints also apply to SDNodes.
> Certain memory accesses, such as [load](https://llvm.org/docs/LangRef.html#i-load)’s, [store](https://llvm.org/docs/LangRef.html#i-store)’s, and [llvm.memcpy](https://llvm.org/docs/LangRef.html#int-memcpy)’s may be marked volatile. The optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations. The optimizers may change the order of volatile operations relative to non-volatile operations. This is not Java’s “volatile” and has no cross-thread synchronization behavior.
Source: https://llvm.org/docs/LangRef.html#volatile-memory-accesses
From 75fcfe07b7f8199e834f79a318b10c41a5118810 Mon Sep 17 00:00:00 2001
From: Matt Davis <mattd at nvidia.com>
Date: Fri, 28 Jun 2024 11:04:33 -0700
Subject: [PATCH] [SelectionDAG] Preserve volatile undef stores.
This patch preserves undef SDNodes that are volatile qualified.
Previously, these nodes would be discarded. The motivation behind this
change is to adhere to the LangRef. Even though that doc is written
mostly in terms of LLVM IR, it seems reasonable to infer that the
volatile constraints also apply to SDNodes.
From the LangRef: https://llvm.org/docs/LangRef.html#volatile-memory-accesses
> Certain memory accesses, such as load's, store's, and llvm.memcpy's may
be marked volatile. The optimizers must not change the number of
volatile operations or change their order of execution relative to other
volatile operations. The optimizers may change the order of volatile
operations relative to non-volatile operations.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
.../abi-attribute-hints-undefined-behavior.ll | 2 +-
.../CodeGen/AMDGPU/bitcast-vector-extract.ll | 4 ++--
.../AMDGPU/lds-global-non-entry-func.ll | 2 +-
.../lower-work-group-id-intrinsics-pal.ll | 23 ++++++++++++++-----
llvm/test/CodeGen/AMDGPU/mem-builtins.ll | 12 +++++-----
llvm/test/CodeGen/NVPTX/store-undef.ll | 16 +++++++++++++
7 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aa9032ea2574c..9089bfd2a0381 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21413,7 +21413,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
// Turn 'store undef, Ptr' -> nothing.
- if (Value.isUndef() && ST->isUnindexed())
+ if (Value.isUndef() && ST->isUnindexed() && !ST->isVolatile())
return Chain;
// Try to infer better alignment information than the store already has.
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index ae20ab1de3a2d..32f6c79355d42 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -255,7 +255,7 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
%queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
%implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
%dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
- store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
+ store i64 %dispatch.id, ptr addrspace(1) %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 80732d5de1e20..ca339938161bd 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
%bc = bitcast i64 %undef to <2 x i32>
- store volatile <2 x i32> %bc, ptr addrspace(1) %out
+ store <2 x i32> %bc, ptr addrspace(1) %out
ret void
}
@@ -83,7 +83,7 @@ define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractel
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
%bc = bitcast i64 %undef to <2 x i32>
%elt1 = extractelement <2 x i32> %bc, i32 1
- store volatile i32 %elt1, ptr addrspace(1) %out
+ store i32 %elt1, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 3b3e107a62967..db04af0861293 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -153,7 +153,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB1_2:
; GISEL-NEXT: s_endpgm
- store volatile i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
+ store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 8009f917aef5a..f90753652baa5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
@@ -126,10 +126,21 @@ define amdgpu_cs void @caller() {
declare amdgpu_gfx void @callee(i32)
define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
-; GFX9-LABEL: workgroup_ids_gfx:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: workgroup_ids_gfx:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_gfx:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
; GFX9ARCH-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index dd892ec3d59b3..0c43facb760b4 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -16,7 +16,7 @@ declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1
define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
entry:
%cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
- store volatile i32 %cmp, ptr addrspace(1) undef
+ store i32 %cmp, ptr addrspace(1) undef
ret void
}
@@ -26,7 +26,7 @@ entry:
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr@rel32@hi+12
define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
%res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
- store volatile ptr addrspace(1) %res, ptr addrspace(1) undef
+ store ptr addrspace(1) %res, ptr addrspace(1) undef
ret void
}
@@ -36,7 +36,7 @@ define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %le
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy@rel32@hi+12
define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
%res = call ptr @strcpy(ptr %dst, ptr %src)
- store volatile ptr %res, ptr addrspace(1) undef
+ store ptr %res, ptr addrspace(1) undef
ret void
}
@@ -46,7 +46,7 @@ define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp@rel32@hi+12
define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
%res = call i32 @strcmp(ptr %src0, ptr %src1)
- store volatile i32 %res, ptr addrspace(1) undef
+ store i32 %res, ptr addrspace(1) undef
ret void
}
@@ -56,7 +56,7 @@ define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen@rel32@hi+12
define amdgpu_kernel void @test_strlen(ptr %src) #0 {
%res = call i32 @strlen(ptr %src)
- store volatile i32 %res, ptr addrspace(1) undef
+ store i32 %res, ptr addrspace(1) undef
ret void
}
@@ -66,7 +66,7 @@ define amdgpu_kernel void @test_strlen(ptr %src) #0 {
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen@rel32@hi+12
define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
%res = call i32 @strnlen(ptr %src, i32 %size)
- store volatile i32 %res, ptr addrspace(1) undef
+ store i32 %res, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 4941760a78c79..6d54a15af25d9 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -90,3 +90,19 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
store %struct.T %S2, ptr %out
ret void
}
+
+define void @test_store_volatile_undef(ptr %out) {
+; CHECK-LABEL: test_store_volatile_undef(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
+; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT: ret;
+ store volatile %struct.T undef, ptr %out
+ ret void
+}
More information about the llvm-commits
mailing list