[llvm] [X86] Use fence(seq_cst) in IdempotentRMWIntoFencedLoad (PR #126521)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 06:29:10 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Valentin Churavy (vchuravy)
Changes:
This extends the optimization to subtargets without MFENCE (`!hasMFence`) and to
`SyncScope::SingleThread`, by emitting a `fence seq_cst` instead of calling the
`llvm.x86.sse2.mfence` intrinsic directly.
Originally part of #106555
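
As a hedged illustration of what `lowerIdempotentRMWIntoFencedLoad` does at the IR level (function and value names below are made up for the example; the exact ordering placed on the load is chosen by the lowering, not asserted here):

```llvm
; Input: an idempotent RMW, i.e. one that never changes the stored value.
define i32 @idempotent_or(ptr %p) {
  %v = atomicrmw or ptr %p, i32 0 seq_cst
  ret i32 %v
}

; Conceptual result of the transform with this patch: a seq_cst fence plus an
; atomic load. The fence lowers to MFENCE where available, to a locked no-op
; such as `lock orl $0, (%esp)` when MFENCE is unavailable, or to a
; compiler-only barrier for syncscope("singlethread").
define i32 @idempotent_or_lowered(ptr %p) {
  fence seq_cst
  %v = load atomic i32, ptr %p monotonic, align 4  ; ordering is illustrative
  ret i32 %v
}
```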
---
Full diff: https://github.com/llvm/llvm-project/pull/126521.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3-14)
- (modified) llvm/test/CodeGen/X86/atomic-idempotent.ll (+30-56)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 995b4de12ce12c..8a27bd1acaa2cd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31807,21 +31807,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SSID == SyncScope::SingleThread)
- // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
- // the IR level, so we must wrap it in an intrinsic.
- return nullptr;
-
- if (!Subtarget.hasMFence())
- // FIXME: it might make sense to use a locked operation here but on a
- // different cache-line to prevent cache-line bouncing. In practice it
- // is probably a small win, and x86 processors without mfence are rare
- // enough that we do not bother.
- return nullptr;
- Function *MFence =
- llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
- Builder.CreateCall(MFence, {});
+ // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
+ // lowering for SSID == SyncScope::SingleThread and !hasMFence.
+ Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 55b4d1af094f67..10e8cfc0ad497e 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
;
; X86-SLM-LABEL: add8:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: xorl %eax, %eax
-; X86-SLM-NEXT: lock xaddb %al, (%ecx)
-; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzbl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: add8:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: xorl %eax, %eax
-; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzbl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
;
; X86-SLM-LABEL: or16:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movzwl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-SLM-NEXT: jne .LBB1_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzwl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: or16:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movzwl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB1_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzwl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw or ptr %p, i16 0 acquire
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
;
; X86-SLM-LABEL: xor32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB2_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: xor32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB2_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw xor ptr %p, i32 0 release
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
;
; X86-SLM-LABEL: and32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB5_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: and32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB5_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
ret i32 %1
``````````
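
The updated X86-SLM/X86-ATOM checks above show the no-MFENCE case: the `fence seq_cst` becomes `lock orl $0, (%esp)` followed by a plain load, rather than keeping the xadd/cmpxchg loop. The other previously rejected case, `SyncScope::SingleThread`, also goes through this path now; a hedged IR sketch (not taken from the test file, names are illustrative):

```llvm
; A single-thread-scoped idempotent RMW. The old code bailed out here because
; the only barrier it could emit at the IR level was the cross-thread MFENCE
; intrinsic; with `fence syncscope("singlethread") seq_cst` the backend can
; emit just a compiler barrier instead.
define i16 @idempotent_singlethread(ptr %p) {
  %v = atomicrmw or ptr %p, i16 0 syncscope("singlethread") seq_cst
  ret i16 %v
}
```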
https://github.com/llvm/llvm-project/pull/126521
More information about the llvm-commits mailing list