[llvm] b334321 - [X86] Prefer `lock or` over mfence (#106555)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 08:12:27 PDT 2025
Author: Valentin Churavy
Date: 2025-03-11T16:12:23+01:00
New Revision: b334321678d4df2d8b1572301c827cae4d4097e2
URL: https://github.com/llvm/llvm-project/commit/b334321678d4df2d8b1572301c827cae4d4097e2
DIFF: https://github.com/llvm/llvm-project/commit/b334321678d4df2d8b1572301c827cae4d4097e2.diff
LOG: [X86] Prefer `lock or` over mfence (#106555)
Originally discussed in https://reviews.llvm.org/D129947
LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs `lock or` is more efficient and provides the same sequential
consistency. GCC 11 made this switch as well (see
https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html),
and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632
already moved in this direction, but didn't touch `fence seq_cst`.
This switches to `lock or` on all x64 systems and leaves
`__builtin_ia32_mfence` for those who want precisely this instruction.
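For illustration, a minimal C sketch of the resulting distinction (the
function names are just examples, not part of the change):

    // Portable seq_cst fence: on x86-64 this now lowers to
    // `lock orl $0, -N(%rsp)` instead of `mfence`.
    void full_fence(void) {
      __atomic_thread_fence(__ATOMIC_SEQ_CST);
    }

    // Still emits the literal `mfence` opcode, for callers that want
    // exactly that instruction.
    void explicit_mfence(void) {
      __builtin_ia32_mfence();
    }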
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86Subtarget.h
llvm/test/CodeGen/X86/atomic-idempotent.ll
llvm/test/CodeGen/X86/atomic-unordered.ll
llvm/test/CodeGen/X86/implicit-null-check.ll
llvm/test/CodeGen/X86/membarrier.ll
llvm/test/CodeGen/X86/mfence.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8984022f8877c..543196adf29e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31908,7 +31908,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// especially clever.
// Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
- // lowering for SSID == SyncScope::SingleThread and !hasMFence
+ // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
// Finally we can emit the atomic load.
@@ -31997,7 +31997,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
- if (Subtarget.hasMFence())
+ if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 722076ca88c9c..8f2d326a69398 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -280,6 +280,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// supports it.
bool hasMFence() const { return hasSSE2() || is64Bit(); }
+ /// Avoid use of `mfence` for `fence seq_cst`, and instead use `lock or`.
+ bool avoidMFence() const { return is64Bit(); }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 91355bd64cade..020f9eb793102 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -14,7 +14,7 @@
define i8 @add8(ptr %p) #0 {
; X64-LABEL: add8:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl (%rdi), %eax
; X64-NEXT: retq
;
@@ -47,7 +47,7 @@ define i8 @add8(ptr %p) #0 {
define i16 @or16(ptr %p) #0 {
; X64-LABEL: or16:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: retq
;
@@ -80,7 +80,7 @@ define i16 @or16(ptr %p) #0 {
define i32 @xor32(ptr %p) #0 {
; X64-LABEL: xor32:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
@@ -113,7 +113,7 @@ define i32 @xor32(ptr %p) #0 {
define i64 @sub64(ptr %p) #0 {
; X64-LABEL: sub64:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: retq
;
@@ -265,7 +265,7 @@ define i128 @or128(ptr %p) #0 {
define i32 @and32 (ptr %p) #0 {
; X64-LABEL: and32:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a..e8e0ee0b7ef49 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
; CHECK-LABEL: nofold_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $15, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
; CHECK-LABEL: fold_constant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq Constant(%rip), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
; CHECK-LABEL: fold_invariant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-LABEL: fold_cmp_over_fence:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: mfence
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
; CHECK-O0-NEXT: jne .LBB116_2
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-LABEL: fold_cmp_over_fence:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: mfence
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
; CHECK-O3-NEXT: jne .LBB116_2
; CHECK-O3-NEXT: # %bb.1: # %taken
diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll
index fc81f703f5d40..de63c9ae209df 100644
--- a/llvm/test/CodeGen/X86/implicit-null-check.ll
+++ b/llvm/test/CodeGen/X86/implicit-null-check.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
define i32 @imp_null_check_load(ptr %x) {
@@ -465,7 +466,7 @@ define i32 @imp_null_check_load_fence2(ptr %x) {
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: je LBB17_1
; CHECK-NEXT: ## %bb.2: ## %not_null
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl (%rdi), %eax
; CHECK-NEXT: retq
; CHECK-NEXT: LBB17_1: ## %is_null
diff --git a/llvm/test/CodeGen/X86/membarrier.ll b/llvm/test/CodeGen/X86/membarrier.ll
index 55f2a2f210139..2773f01f7ab82 100644
--- a/llvm/test/CodeGen/X86/membarrier.ll
+++ b/llvm/test/CodeGen/X86/membarrier.ll
@@ -6,9 +6,9 @@ define i32 @t() {
; CHECK-LABEL: t:
; CHECK: # %bb.0:
; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: lock decl -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
%i = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll
index 8c29af8648712..ce74d2dd69f9b 100644
--- a/llvm/test/CodeGen/X86/mfence.ll
+++ b/llvm/test/CodeGen/X86/mfence.ll
@@ -5,10 +5,15 @@
; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence.
define void @test() {
-; CHECK-LABEL: test:
-; CHECK: # %bb.0:
-; CHECK-NEXT: mfence
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: test:
+; X86: # %bb.0:
+; X86-NEXT: mfence
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # %bb.0:
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: retq
fence seq_cst
ret void
}
@@ -23,10 +28,25 @@ define i32 @fence(ptr %ptr) {
;
; X64-LABEL: fence:
; X64: # %bb.0:
-; X64-NEXT: mfence
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
%atomic = atomicrmw add ptr %ptr, i32 0 seq_cst
ret i32 %atomic
}
+define void @mfence() nounwind {
+; X32-LABEL: mfence:
+; X32: # %bb.0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; CHECK-LABEL: mfence:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mfence
+; CHECK-NEXT: ret{{[l|q]}}
+ call void @llvm.x86.sse2.mfence()
+ ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+