[llvm] [CodeGen] Treat hasOrderedMemoryRef as implying arbitrary loads or stores (PR #182000)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 07:59:43 PST 2026
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/182000
From 77f871eac82283c74a11ffcd7f42c8b8e8b86bdf Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 18 Feb 2026 11:48:44 +0000
Subject: [PATCH 1/2] [CodeGen] Treat hasOrderedMemoryRef as implying arbitrary
loads or stores
---
llvm/lib/CodeGen/MachineInstr.cpp | 3 +--
.../CodeGen/AArch64/sme-streaming-checkvl.ll | 19 ++++++++++-----
llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll | 23 +++++++++----------
.../CodeGen/AMDGPU/misched-remat-revert.ll | 1 +
llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll | 10 ++++----
5 files changed, 31 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 116fc90752530..6e78c0b47aa05 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1356,8 +1356,7 @@ bool MachineInstr::isSafeToMove(bool &SawStore) const {
// Treat volatile loads as stores. This is not strictly necessary for
// volatiles, but it is required for atomic loads. It is not allowed to move
// a load across an atomic load with Ordering > Monotonic.
- if (mayStore() || isCall() || isPHI() ||
- (mayLoad() && hasOrderedMemoryRef())) {
+ if (mayStore() || isCall() || isPHI() || hasOrderedMemoryRef()) {
SawStore = true;
return false;
}
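For context, isSafeToMove is the query that sinking-style passes consult while walking a block: the SawStore flag accumulates across the walk, and once it is set, plain loads further down are no longer considered movable. A minimal sketch of that protocol, assuming a forward walk of one block (the trySink helper is hypothetical, not from the patch):

// Sketch: how a caller threads SawStore through one basic block.
// trySink is a hypothetical placeholder for the pass's motion logic.
bool SawStore = false;
for (MachineInstr &MI : MBB) {
  // After this patch, any instruction with an ordered memory reference
  // (volatile or atomic memory operands, fences, barriers) sets SawStore
  // via isSafeToMove, so plain loads below it stop being movable.
  if (!MI.isSafeToMove(SawStore))
    continue;
  trySink(MI);
}

The point of the change above is that such an instruction now sets SawStore even when it reports neither mayLoad() nor mayStore().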
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
index 58c6e2e27c451..475cb2879da70 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -35,13 +35,13 @@ define void @foo_non_streaming_pass_arg(ptr %arg) {
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: addsvl x8, x8, #-1
; CHECK-NEXT: cbz x8, .LBB0_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB0_2: // %entry
-; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: sub x8, x29, #64
; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
@@ -109,14 +109,14 @@ define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 {
; CHECK-NEXT: .cfi_offset b15, -1136
; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: mrs x19, SVCR
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: addsvl x8, x8, #-1
; CHECK-NEXT: cbz x8, .LBB1_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB1_2: // %entry
-; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: sub x8, x29, #1088
; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: tbnz w19, #0, .LBB1_4
@@ -177,6 +177,8 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 {
; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Spill
; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Spill
; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Spill
+; CHECK-NEXT: add x29, sp, #1088
+; CHECK-NEXT: .cfi_def_cfa w29, 32
; CHECK-NEXT: .cfi_offset w28, -8
; CHECK-NEXT: .cfi_offset vg, -16
; CHECK-NEXT: .cfi_offset w30, -24
@@ -190,7 +192,10 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 {
; CHECK-NEXT: .cfi_offset b14, -1112
; CHECK-NEXT: .cfi_offset b15, -1120
; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: .cfi_def_cfa_offset 2144
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: addsvl x8, x8, #-1
@@ -198,11 +203,13 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 {
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB2_2: // %entry
-; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: bl bar
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #1024
-; CHECK-NEXT: .cfi_def_cfa_offset 1120
+; CHECK-NEXT: .cfi_def_cfa wsp, 1120
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
index 83e34906fa30c..9843774d55455 100644
--- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
@@ -9,31 +9,30 @@ define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) {
; CHECK: ; %bb.0: ; %.lr.ph
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_mov_b32 s3, 32
+; CHECK-NEXT: s_mov_b32 s12, 32
; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
; CHECK-NEXT: s_mov_b64 s[10:11], 0
-; CHECK-NEXT: s_mov_b64 s[12:13], 0
; CHECK-NEXT: .LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
+; CHECK-NEXT: s_and_b32 s1, s1, 0xffff
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_add_u32 s10, s6, s12
-; CHECK-NEXT: s_addc_u32 s11, s7, s13
-; CHECK-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
-; CHECK-NEXT: s_add_i32 s3, s3, -1
-; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+; CHECK-NEXT: s_add_u32 s0, s6, s10
+; CHECK-NEXT: s_addc_u32 s1, s7, s11
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT: s_add_i32 s12, s12, -1
+; CHECK-NEXT: s_cmp_lg_u32 s12, 0
; CHECK-NEXT: ; iglp_opt mask(0x00000000)
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %end
-; CHECK-NEXT: s_and_b32 s1, s1, 0xffff
-; CHECK-NEXT: s_mov_b32 s3, s2
-; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
; CHECK-NEXT: ds_write_b64 v2, v[0:1]
; CHECK-NEXT: s_endpgm
.lr.ph:
diff --git a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
index a746b486ffb74..aa6322a0500f1 100644
--- a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
+++ b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
@@ -1,5 +1,6 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -O3 -debug-only=machine-scheduler 2>&1 < %s | FileCheck -check-prefix=DEBUG %s
; REQUIRES: asserts
+; XFAIL: *

; This testcase hit a situation where reverting scheduling after the scheduler's
; rematerialization stage would cause incoherent MI and slot orders, hitting an
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index cba0f9cbba2ca..f3495eb937891 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -1156,17 +1156,17 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
; CHECK-NEXT: bne .LBB7_6
; CHECK-NEXT: b .LBB7_2
; CHECK-NEXT: .LBB7_9:
+; CHECK-NEXT: adr r1, .LCPI7_0
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, r1
; CHECK-NEXT: lsrs r0, r0, #3
+; CHECK-NEXT: vldrw.u32 q2, [q1, #64]!
; CHECK-NEXT: wls lr, r0, .LBB7_12
; CHECK-NEXT: @ %bb.10:
-; CHECK-NEXT: adr r0, .LCPI7_0
; CHECK-NEXT: vldr s0, [sp] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vldrw.u32 q2, [q1, #64]!
; CHECK-NEXT: vldrw.u32 q0, [q1, #16]
; CHECK-NEXT: .LBB7_11: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q3, [q1, #24]
From 99cda48ff8eb32621d972537cc3f68ebd7da7ca7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 4 Mar 2026 15:58:40 +0000
Subject: [PATCH 2/2] Add a test
---
.../test/CodeGen/AMDGPU/machine-sink-fence.ll | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-fence.ll
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-fence.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-fence.ll
new file mode 100644
index 0000000000000..7885a9317748b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-fence.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+; Check that the ds_load_b32 instruction is not sunk past the
+; fence-barrier-fence sequence.
+define amdgpu_kernel void @test_fence(ptr addrspace(1) %arg, i1 %arg1) {
+; CHECK-LABEL: test_fence:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; CHECK-NEXT: ds_load_b32 v1, v1
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_bitcmp0_b32 s0, 0
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: s_barrier_signal -1
+; CHECK-NEXT: s_barrier_wait -1
+; CHECK-NEXT: global_inv scope:SCOPE_SE
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %bb2
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
+; CHECK-NEXT: .LBB0_2: ; %bb8
+; CHECK-NEXT: s_endpgm
+bb:
+ %i6 = load i32, ptr addrspace(3) null
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ fence syncscope("workgroup") acquire
+ br i1 %arg1, label %bb2, label %bb8
+
+bb2:
+ %i3 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %i5 = getelementptr i32, ptr addrspace(1) %arg, i32 %i3
+ %i7 = atomicrmw add ptr addrspace(1) %i5, i32 %i6 monotonic
+ br label %bb8
+
+bb8:
+ ret void
+}
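The fence and barrier instructions in this sequence carry ordered memory operands (or unmodeled side effects) without being plain loads or stores, which is exactly the case the old mayLoad() guard missed. For reference, hasOrderedMemoryRef answers conservatively; a paraphrased sketch of its logic (not verbatim, details can differ between LLVM versions):

// Paraphrased sketch of MachineInstr::hasOrderedMemoryRef (not verbatim;
// see llvm/lib/CodeGen/MachineInstr.cpp for the real implementation).
bool MachineInstr::hasOrderedMemoryRef() const {
  // An instruction known never to touch memory cannot have an ordered
  // reference.
  if (!mayStore() && !mayLoad() && !isCall() && !hasUnmodeledSideEffects())
    return false;
  // With no memory operand info, conservatively assume it is ordered.
  if (memoperands_empty())
    return true;
  // Otherwise it is ordered if any memory operand is volatile or atomic.
  return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
    return !MMO->isUnordered();
  });
}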