[llvm] [X86] Prefer `lock or` over mfence (PR #106555)
Valentin Churavy via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 06:24:15 PDT 2024
https://github.com/vchuravy created https://github.com/llvm/llvm-project/pull/106555
Originally opened as https://reviews.llvm.org/D129947
LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs, `lock or` is more efficient and provides the same sequential
consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html),
and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632
moved in this direction as well, but didn't touch `fence seq_cst`.
Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/
After another 2 years it doesn't look like anyone complained about the
GCC switch. And there is still `__builtin_ia32_mfence` for folks who
want this precise instruction.
From 711f0c229fe90ce7a3a4ad71e5953a76938b2708 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Thu, 29 Aug 2024 15:17:40 +0200
Subject: [PATCH] [X86] Prefer `lock or` over mfence
Originally opened as https://reviews.llvm.org/D129947
LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs, `lock or` is more efficient and provides the same sequential
consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html),
and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632
moved in this direction as well, but didn't touch `fence seq_cst`.
Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/
After another 2 years it doesn't look like anyone complained about the
GCC switch. And there is still `__builtin_ia32_mfence` for folks who
want this precise instruction.
---
llvm/lib/Target/X86/X86.td | 40 ++++++++++++++++-------
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
llvm/test/CodeGen/X86/atomic-unordered.ll | 10 +++---
3 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 988966fa6a6c46..dfa534a69e7024 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;
+def TuningAvoidMFENCE
+ : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+ "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -882,7 +886,8 @@ def ProcessorFeatures {
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +908,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -969,7 +975,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -1004,7 +1011,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -1047,7 +1055,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1076,7 +1085,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1222,7 +1232,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+ list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);
@@ -1429,7 +1440,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1521,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1664,7 +1677,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
],
[
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
// Intel Core 2 Solo/Duo.
@@ -1684,7 +1698,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1703,7 +1718,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f011249d295040..aade718c1efe80 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31103,7 +31103,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
- if (Subtarget.hasMFence())
+ if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..e8e0ee0b7ef492 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
; CHECK-LABEL: nofold_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $15, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
; CHECK-LABEL: fold_constant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq Constant(%rip), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
; CHECK-LABEL: fold_invariant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-LABEL: fold_cmp_over_fence:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: mfence
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
; CHECK-O0-NEXT: jne .LBB116_2
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-LABEL: fold_cmp_over_fence:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: mfence
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
; CHECK-O3-NEXT: jne .LBB116_2
; CHECK-O3-NEXT: # %bb.1: # %taken
More information about the llvm-commits
mailing list