[llvm] [X86] Prefer `lock or` over mfence (PR #106555)
Valentin Churavy via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 07:09:40 PDT 2024
https://github.com/vchuravy updated https://github.com/llvm/llvm-project/pull/106555
>From 4d502dd7dd8c505775763bd783bb33678bff9e63 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Thu, 29 Aug 2024 15:17:40 +0200
Subject: [PATCH 1/2] [X86] Prefer `lock or` over mfence
Originally opened as https://reviews.llvm.org/D129947
LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs a `lock or` of zero into a stack slot is more efficient and
provides the same sequential consistency. GCC 11 made the same switch
(see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and
https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632
already moved LLVM in this direction, but did not touch `fence seq_cst`.
Amusingly, this also came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/
Two more years on, nobody appears to have complained about the GCC
switch, and `__builtin_ia32_mfence` is still available for anyone who
wants this precise instruction.
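As a quick illustration (not part of the patch), the change affects code
like the following; the exact stack offset of the locked no-op is chosen
by the backend and is only an assumption in the comment below:

#include <atomic>

void publish(int *data, std::atomic<bool> &ready) {
  *data = 42;                                           // plain store
  std::atomic_thread_fence(std::memory_order_seq_cst);  // lowered as `fence seq_cst`
  ready.store(true, std::memory_order_relaxed);
}

// Previously the fence lowered to:   mfence
// With this patch (on tuned CPUs):   lock orl $0, -8(%rsp)   // offset illustrative

Both forms act as a full barrier on x86 (they drain the store buffer
before later loads execute); the locked no-op on a dead stack slot is
simply cheaper on most current microarchitectures.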
---
llvm/lib/Target/X86/X86.td | 49 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +-
llvm/test/CodeGen/X86/atomic-idempotent.ll | 86 +--
llvm/test/CodeGen/X86/atomic-unordered.ll | 765 ++++++++++++++++++++-
llvm/test/CodeGen/X86/mfence.ll | 32 +
5 files changed, 845 insertions(+), 106 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6cf37836f921d4..3d79fb2efdd758 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;
+def TuningAvoidMFENCE
+ : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+ "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -815,7 +819,8 @@ def ProcessorFeatures {
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
];
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -831,7 +836,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
];
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -850,7 +856,8 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE
];
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -874,7 +881,8 @@ def ProcessorFeatures {
TuningFastGather,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE
];
// Nehalem
@@ -882,7 +890,8 @@ def ProcessorFeatures {
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +912,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -969,7 +979,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -1004,7 +1015,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -1047,7 +1059,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1076,7 +1089,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1222,7 +1236,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+ list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);
@@ -1429,7 +1444,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1525,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1697,7 +1714,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1716,7 +1734,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 73f7f52846f625..76df32bef62fea 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31422,21 +31422,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SSID == SyncScope::SingleThread)
- // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
- // the IR level, so we must wrap it in an intrinsic.
- return nullptr;
-
- if (!Subtarget.hasMFence())
- // FIXME: it might make sense to use a locked operation here but on a
- // different cache-line to prevent cache-line bouncing. In practice it
- // is probably a small win, and x86 processors without mfence are rare
- // enough that we do not bother.
- return nullptr;
- Function *MFence =
- llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
- Builder.CreateCall(MFence, {});
+ // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
+ // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence.
+ Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31524,7 +31513,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
- if (Subtarget.hasMFence())
+ if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index d5c46485068a64..4deedd5726b244 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
;
; X86-SLM-LABEL: add8:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: xorl %eax, %eax
-; X86-SLM-NEXT: lock xaddb %al, (%ecx)
-; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzbl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: add8:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: xorl %eax, %eax
-; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzbl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
;
; X86-SLM-LABEL: or16:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movzwl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4, 0x90
-; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-SLM-NEXT: jne .LBB1_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzwl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: or16:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movzwl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4, 0x90
-; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB1_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzwl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw or ptr %p, i16 0 acquire
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
;
; X86-SLM-LABEL: xor32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4, 0x90
-; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB2_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: xor32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4, 0x90
-; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB2_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw xor ptr %p, i32 0 release
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
;
; X86-SLM-LABEL: and32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4, 0x90
-; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB5_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: and32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4, 0x90
-; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB5_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
ret i32 %1
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..ff101b9037f0ef 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0 %s
; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3 %s
+; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -mattr=-avoid-mfence | FileCheck --check-prefixes=CHECK,CHECK-MFENCE %s
define i8 @load_i8(ptr %ptr) {
; CHECK-O0-LABEL: load_i8:
@@ -12,6 +13,11 @@ define i8 @load_i8(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzbl (%rdi), %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i8:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i8, ptr %ptr unordered, align 1
ret i8 %v
}
@@ -27,6 +33,11 @@ define void @store_i8(ptr %ptr, i8 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movb %sil, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: store_i8:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movb %sil, (%rdi)
+; CHECK-MFENCE-NEXT: retq
store atomic i8 %v, ptr %ptr unordered, align 1
ret void
}
@@ -41,6 +52,11 @@ define i16 @load_i16(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzwl (%rdi), %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i16:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i16, ptr %ptr unordered, align 2
ret i16 %v
}
@@ -57,6 +73,11 @@ define void @store_i16(ptr %ptr, i16 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movw %si, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: store_i16:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movw %si, (%rdi)
+; CHECK-MFENCE-NEXT: retq
store atomic i16 %v, ptr %ptr unordered, align 2
ret void
}
@@ -116,6 +137,11 @@ define void @narrow_writeback_or(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: orq $7, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: narrow_writeback_or:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: orq $7, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %ptr unordered, align 8
%v.new = or i64 %v, 7
store atomic i64 %v.new, ptr %ptr unordered, align 8
@@ -138,6 +164,12 @@ define void @narrow_writeback_and(ptr %ptr) {
; CHECK-O3-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00
; CHECK-O3-NEXT: andq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: narrow_writeback_and:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00
+; CHECK-MFENCE-NEXT: andq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %ptr unordered, align 8
%v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00
store atomic i64 %v.new, ptr %ptr unordered, align 8
@@ -157,6 +189,11 @@ define void @narrow_writeback_xor(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: xorq $7, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: narrow_writeback_xor:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: xorq $7, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %ptr unordered, align 8
%v.new = xor i64 %v, 7
store atomic i64 %v.new, ptr %ptr unordered, align 8
@@ -254,6 +291,14 @@ define void @store_i128(ptr %ptr, i128 %v) {
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: store_i128:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: vmovq %rdx, %xmm0
+; CHECK-MFENCE-NEXT: vmovq %rsi, %xmm1
+; CHECK-MFENCE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-MFENCE-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-MFENCE-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
}
@@ -305,6 +350,28 @@ define i256 @load_i256(ptr %ptr) {
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: vzeroupper
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i256:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: pushq %rbx
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16
+; CHECK-MFENCE-NEXT: subq $32, %rsp
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48
+; CHECK-MFENCE-NEXT: .cfi_offset %rbx, -16
+; CHECK-MFENCE-NEXT: movq %rdi, %rbx
+; CHECK-MFENCE-NEXT: movq %rsp, %rdx
+; CHECK-MFENCE-NEXT: movl $32, %edi
+; CHECK-MFENCE-NEXT: xorl %ecx, %ecx
+; CHECK-MFENCE-NEXT: callq __atomic_load at PLT
+; CHECK-MFENCE-NEXT: vmovups (%rsp), %ymm0
+; CHECK-MFENCE-NEXT: vmovups %ymm0, (%rbx)
+; CHECK-MFENCE-NEXT: movq %rbx, %rax
+; CHECK-MFENCE-NEXT: addq $32, %rsp
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16
+; CHECK-MFENCE-NEXT: popq %rbx
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8
+; CHECK-MFENCE-NEXT: vzeroupper
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i256, ptr %ptr unordered, align 16
ret i256 %v
}
@@ -345,6 +412,24 @@ define void @store_i256(ptr %ptr, i256 %v) {
; CHECK-O3-NEXT: addq $40, %rsp
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: store_i256:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: subq $40, %rsp
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48
+; CHECK-MFENCE-NEXT: movq %rdi, %rax
+; CHECK-MFENCE-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; CHECK-MFENCE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-MFENCE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-MFENCE-NEXT: movq %rsi, (%rsp)
+; CHECK-MFENCE-NEXT: movq %rsp, %rdx
+; CHECK-MFENCE-NEXT: movl $32, %edi
+; CHECK-MFENCE-NEXT: movq %rax, %rsi
+; CHECK-MFENCE-NEXT: xorl %ecx, %ecx
+; CHECK-MFENCE-NEXT: callq __atomic_store at PLT
+; CHECK-MFENCE-NEXT: addq $40, %rsp
+; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8
+; CHECK-MFENCE-NEXT: retq
store atomic i256 %v, ptr %ptr unordered, align 16
ret void
}
@@ -366,6 +451,14 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) {
; CHECK-O3-NEXT: movl %eax, (%rdi)
; CHECK-O3-NEXT: movl %ecx, 4(%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: vec_store:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax
+; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx
+; CHECK-MFENCE-NEXT: movl %eax, (%rdi)
+; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi)
+; CHECK-MFENCE-NEXT: retq
%v1 = extractelement <2 x i32> %vec, i32 0
%v2 = extractelement <2 x i32> %vec, i32 1
%p1 = getelementptr i32, ptr %p0, i64 1
@@ -391,6 +484,14 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) {
; CHECK-O3-NEXT: movl %eax, (%rdi)
; CHECK-O3-NEXT: movl %ecx, 4(%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: vec_store_unaligned:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax
+; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx
+; CHECK-MFENCE-NEXT: movl %eax, (%rdi)
+; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi)
+; CHECK-MFENCE-NEXT: retq
%v1 = extractelement <2 x i32> %vec, i32 0
%v2 = extractelement <2 x i32> %vec, i32 1
%p1 = getelementptr i32, ptr %p0, i64 1
@@ -496,6 +597,12 @@ define i64 @load_fold_add3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: addq (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_add3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: addq (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = add i64 %v, %v2
@@ -515,6 +622,12 @@ define i64 @load_fold_sub1(ptr %p) {
; CHECK-O3-NEXT: movq (%rdi), %rax
; CHECK-O3-NEXT: addq $-15, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_sub1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: addq $-15, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = sub i64 %v, 15
ret i64 %ret
@@ -556,6 +669,13 @@ define i64 @load_fold_mul1(ptr %p) {
; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_mul1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = mul i64 %v, 15
ret i64 %ret
@@ -584,6 +704,12 @@ define i64 @load_fold_mul3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: imulq (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_mul3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: imulq (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = mul i64 %v, %v2
@@ -613,6 +739,20 @@ define i64 @load_fold_sdiv1(ptr %p) {
; CHECK-O3-NEXT: addq %rax, %rcx
; CHECK-O3-NEXT: movq %rcx, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_sdiv1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rcx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: imulq %rdx
+; CHECK-MFENCE-NEXT: addq %rdx, %rcx
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: shrq $63, %rax
+; CHECK-MFENCE-NEXT: sarq $3, %rcx
+; CHECK-MFENCE-NEXT: addq %rax, %rcx
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = sdiv i64 %v, 15
ret i64 %ret
@@ -644,6 +784,24 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: divl %esi
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_sdiv2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB35_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rsi
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB35_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = sdiv i64 %v, %v2
ret i64 %ret
@@ -675,6 +833,25 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: divl %ecx
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_sdiv3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq (%rsi), %rcx
+; CHECK-MFENCE-NEXT: movq %rax, %rdx
+; CHECK-MFENCE-NEXT: orq %rcx, %rdx
+; CHECK-MFENCE-NEXT: shrq $32, %rdx
+; CHECK-MFENCE-NEXT: je .LBB36_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rcx
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB36_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %ecx
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = sdiv i64 %v, %v2
@@ -699,6 +876,14 @@ define i64 @load_fold_udiv1(ptr %p) {
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
; CHECK-O3-NEXT: shrq $3, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_udiv1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rdx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax
+; CHECK-MFENCE-NEXT: shrq $3, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = udiv i64 %v, 15
ret i64 %ret
@@ -730,6 +915,24 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: divl %esi
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_udiv2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB38_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rsi
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB38_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = udiv i64 %v, %v2
ret i64 %ret
@@ -762,6 +965,25 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: divl %ecx
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_udiv3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq (%rsi), %rcx
+; CHECK-MFENCE-NEXT: movq %rax, %rdx
+; CHECK-MFENCE-NEXT: orq %rcx, %rdx
+; CHECK-MFENCE-NEXT: shrq $32, %rdx
+; CHECK-MFENCE-NEXT: je .LBB39_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rcx
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB39_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %ecx
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = udiv i64 %v, %v2
@@ -795,6 +1017,23 @@ define i64 @load_fold_srem1(ptr %p) {
; CHECK-O3-NEXT: subq %rax, %rcx
; CHECK-O3-NEXT: movq %rcx, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_srem1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rcx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: imulq %rdx
+; CHECK-MFENCE-NEXT: addq %rcx, %rdx
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: shrq $63, %rax
+; CHECK-MFENCE-NEXT: sarq $3, %rdx
+; CHECK-MFENCE-NEXT: addq %rax, %rdx
+; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax
+; CHECK-MFENCE-NEXT: subq %rax, %rcx
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = srem i64 %v, 15
ret i64 %ret
@@ -828,6 +1067,25 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: divl %esi
; CHECK-O3-NEXT: movl %edx, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_srem2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB41_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rsi
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB41_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: movl %edx, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = srem i64 %v, %v2
ret i64 %ret
@@ -861,6 +1119,26 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: divl %ecx
; CHECK-O3-NEXT: movl %edx, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_srem3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq (%rsi), %rcx
+; CHECK-MFENCE-NEXT: movq %rax, %rdx
+; CHECK-MFENCE-NEXT: orq %rcx, %rdx
+; CHECK-MFENCE-NEXT: shrq $32, %rdx
+; CHECK-MFENCE-NEXT: je .LBB42_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rcx
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB42_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %ecx
+; CHECK-MFENCE-NEXT: movl %edx, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = srem i64 %v, %v2
@@ -890,6 +1168,18 @@ define i64 @load_fold_urem1(ptr %p) {
; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
; CHECK-O3-NEXT: subq %rcx, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_urem1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: movq %rax, %rdx
+; CHECK-MFENCE-NEXT: mulxq %rcx, %rcx, %rcx
+; CHECK-MFENCE-NEXT: shrq $3, %rcx
+; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,4), %rcx
+; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,2), %rcx
+; CHECK-MFENCE-NEXT: subq %rcx, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = urem i64 %v, 15
ret i64 %ret
@@ -924,6 +1214,25 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: divl %esi
; CHECK-O3-NEXT: movl %edx, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_urem2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB44_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rsi
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB44_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: movl %edx, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = urem i64 %v, %v2
ret i64 %ret
@@ -958,6 +1267,26 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: divl %ecx
; CHECK-O3-NEXT: movl %edx, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_urem3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq (%rsi), %rcx
+; CHECK-MFENCE-NEXT: movq %rax, %rdx
+; CHECK-MFENCE-NEXT: orq %rcx, %rdx
+; CHECK-MFENCE-NEXT: shrq $32, %rdx
+; CHECK-MFENCE-NEXT: je .LBB45_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rcx
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB45_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %ecx
+; CHECK-MFENCE-NEXT: movl %edx, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = urem i64 %v, %v2
@@ -989,6 +1318,11 @@ define i64 @load_fold_shl2(ptr %p, i64 %v2) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_shl2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = shl i64 %v, %v2
ret i64 %ret
@@ -1008,6 +1342,12 @@ define i64 @load_fold_shl3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: shlxq %rax, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_shl3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: shlxq %rax, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = shl i64 %v, %v2
@@ -1039,6 +1379,11 @@ define i64 @load_fold_lshr2(ptr %p, i64 %v2) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_lshr2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = lshr i64 %v, %v2
ret i64 %ret
@@ -1058,6 +1403,12 @@ define i64 @load_fold_lshr3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: shrxq %rax, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_lshr3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: shrxq %rax, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = lshr i64 %v, %v2
@@ -1089,6 +1440,11 @@ define i64 @load_fold_ashr2(ptr %p, i64 %v2) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_ashr2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = ashr i64 %v, %v2
ret i64 %ret
@@ -1108,6 +1464,12 @@ define i64 @load_fold_ashr3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: sarxq %rax, (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_ashr3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: sarxq %rax, (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = ashr i64 %v, %v2
@@ -1127,6 +1489,12 @@ define i64 @load_fold_and1(ptr %p) {
; CHECK-O3-NEXT: movq (%rdi), %rax
; CHECK-O3-NEXT: andl $15, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_and1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: andl $15, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = and i64 %v, 15
ret i64 %ret
@@ -1155,6 +1523,12 @@ define i64 @load_fold_and3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: andq (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_and3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: andq (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = and i64 %v, %v2
@@ -1196,6 +1570,12 @@ define i64 @load_fold_or3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: orq (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_or3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: orq (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = or i64 %v, %v2
@@ -1237,6 +1617,12 @@ define i64 @load_fold_xor3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq (%rsi), %rax
; CHECK-O3-NEXT: xorq (%rdi), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_xor3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: xorq (%rdi), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = xor i64 %v, %v2
@@ -1256,6 +1642,12 @@ define i1 @load_fold_icmp1(ptr %p) {
; CHECK-O3-NEXT: cmpq $15, (%rdi)
; CHECK-O3-NEXT: sete %al
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_icmp1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: cmpq $15, (%rdi)
+; CHECK-MFENCE-NEXT: sete %al
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = icmp eq i64 %v, 15
ret i1 %ret
@@ -1274,6 +1666,12 @@ define i1 @load_fold_icmp2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: cmpq %rsi, (%rdi)
; CHECK-O3-NEXT: sete %al
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_icmp2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: cmpq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: sete %al
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = icmp eq i64 %v, %v2
ret i1 %ret
@@ -1294,6 +1692,13 @@ define i1 @load_fold_icmp3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: cmpq %rax, (%rdi)
; CHECK-O3-NEXT: sete %al
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_fold_icmp3:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rsi), %rax
+; CHECK-MFENCE-NEXT: cmpq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: sete %al
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p1 unordered, align 8
%v2 = load atomic i64, ptr %p2 unordered, align 8
%ret = icmp eq i64 %v, %v2
@@ -1319,6 +1724,11 @@ define void @rmw_fold_add1(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: addq $15, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_add1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: addq $15, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = add i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1338,6 +1748,11 @@ define void @rmw_fold_add2(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: addq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_add2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: addq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = add i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1357,6 +1772,11 @@ define void @rmw_fold_sub1(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: addq $-15, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_sub1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: addq $-15, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = sub i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1376,6 +1796,11 @@ define void @rmw_fold_sub2(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: subq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_sub2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: subq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = sub i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1411,6 +1836,12 @@ define void @rmw_fold_mul2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: imulq (%rdi), %rsi
; CHECK-O3-NEXT: movq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_mul2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: imulq (%rdi), %rsi
+; CHECK-MFENCE-NEXT: movq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = mul i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1447,6 +1878,20 @@ define void @rmw_fold_sdiv1(ptr %p, i64 %v) {
; CHECK-O3-NEXT: addq %rax, %rdx
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_sdiv1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rcx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: imulq %rdx
+; CHECK-MFENCE-NEXT: addq %rcx, %rdx
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: shrq $63, %rax
+; CHECK-MFENCE-NEXT: sarq $3, %rdx
+; CHECK-MFENCE-NEXT: addq %rax, %rdx
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = sdiv i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1482,6 +1927,26 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_sdiv2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB74_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rsi
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB74_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = sdiv i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1534,6 +1999,26 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_udiv2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB76_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rsi
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB76_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = udiv i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1577,6 +2062,23 @@ define void @rmw_fold_srem1(ptr %p, i64 %v) {
; CHECK-O3-NEXT: subq %rax, %rcx
; CHECK-O3-NEXT: movq %rcx, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_srem1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rcx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: movq %rcx, %rax
+; CHECK-MFENCE-NEXT: imulq %rdx
+; CHECK-MFENCE-NEXT: addq %rcx, %rdx
+; CHECK-MFENCE-NEXT: movq %rdx, %rax
+; CHECK-MFENCE-NEXT: shrq $63, %rax
+; CHECK-MFENCE-NEXT: sarq $3, %rdx
+; CHECK-MFENCE-NEXT: addq %rax, %rdx
+; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax
+; CHECK-MFENCE-NEXT: subq %rax, %rcx
+; CHECK-MFENCE-NEXT: movq %rcx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = srem i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1612,6 +2114,26 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_srem2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB78_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: cqto
+; CHECK-MFENCE-NEXT: idivq %rsi
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB78_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = srem i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1644,6 +2166,18 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) {
; CHECK-O3-NEXT: subq %rax, %rdx
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_urem1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rdx
+; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax
+; CHECK-MFENCE-NEXT: shrq $3, %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax
+; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax
+; CHECK-MFENCE-NEXT: subq %rax, %rdx
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = urem i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1680,6 +2214,26 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_urem2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: orq %rsi, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: je .LBB80_1
+; CHECK-MFENCE-NEXT: # %bb.2:
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divq %rsi
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB80_1:
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: xorl %edx, %edx
+; CHECK-MFENCE-NEXT: divl %esi
+; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx
+; CHECK-MFENCE-NEXT: movq %rdx, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = urem i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1717,6 +2271,12 @@ define void @rmw_fold_shl2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_shl2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = shl i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1754,6 +2314,12 @@ define void @rmw_fold_lshr2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_lshr2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = lshr i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1791,6 +2357,12 @@ define void @rmw_fold_ashr2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_ashr2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = ashr i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1812,6 +2384,11 @@ define void @rmw_fold_and1(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: andq $15, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_and1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: andq $15, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = and i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1831,6 +2408,11 @@ define void @rmw_fold_and2(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: andq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_and2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: andq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = and i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1850,6 +2432,11 @@ define void @rmw_fold_or1(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: orq $15, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_or1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: orq $15, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = or i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1869,6 +2456,11 @@ define void @rmw_fold_or2(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: orq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_or2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: orq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = or i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1888,6 +2480,11 @@ define void @rmw_fold_xor1(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: xorq $15, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_xor1:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: xorq $15, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = xor i64 %prev, 15
store atomic i64 %val, ptr %p unordered, align 8
@@ -1907,6 +2504,11 @@ define void @rmw_fold_xor2(ptr %p, i64 %v) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: xorq %rsi, (%rdi)
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: rmw_fold_xor2:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: xorq %rsi, (%rdi)
+; CHECK-MFENCE-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
%val = xor i64 %prev, %v
store atomic i64 %val, ptr %p unordered, align 8
@@ -1943,6 +2545,13 @@ define i32 @fold_trunc_add(ptr %p, i32 %v2) {
; CHECK-O3-NEXT: addl %esi, %eax
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_trunc_add:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: addl %esi, %eax
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%trunc = trunc i64 %v to i32
%ret = add i32 %trunc, %v2
@@ -1964,6 +2573,13 @@ define i32 @fold_trunc_and(ptr %p, i32 %v2) {
; CHECK-O3-NEXT: andl %esi, %eax
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_trunc_and:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: andl %esi, %eax
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%trunc = trunc i64 %v to i32
%ret = and i32 %trunc, %v2
@@ -1985,6 +2601,13 @@ define i32 @fold_trunc_or(ptr %p, i32 %v2) {
; CHECK-O3-NEXT: orl %esi, %eax
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_trunc_or:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: orl %esi, %eax
+; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%trunc = trunc i64 %v to i32
%ret = or i32 %trunc, %v2
@@ -2012,6 +2635,15 @@ define i32 @split_load(ptr %p) {
; CHECK-O3-NEXT: orl %eax, %ecx
; CHECK-O3-NEXT: movzbl %cl, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: split_load:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: movq %rax, %rcx
+; CHECK-MFENCE-NEXT: shrq $32, %rcx
+; CHECK-MFENCE-NEXT: orl %eax, %ecx
+; CHECK-MFENCE-NEXT: movzbl %cl, %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%b1 = trunc i64 %v to i8
%v.shift = lshr i64 %v, 32
@@ -2093,12 +2725,26 @@ define void @dead_store(ptr %p, i64 %v) {
;; isn't violated.
define i64 @nofold_fence(ptr %p) {
-; CHECK-LABEL: nofold_fence:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
-; CHECK-NEXT: addq $15, %rax
-; CHECK-NEXT: retq
+; CHECK-O0-LABEL: nofold_fence:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O0-NEXT: addq $15, %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: nofold_fence:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT: addq $15, %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: nofold_fence:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: mfence
+; CHECK-MFENCE-NEXT: addq $15, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
fence seq_cst
%ret = add i64 %v, 15
@@ -2148,6 +2794,12 @@ define i64 @fold_constant(i64 %arg) {
; CHECK-O3-NEXT: movq %rdi, %rax
; CHECK-O3-NEXT: addq Constant(%rip), %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_constant:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq %rdi, %rax
+; CHECK-MFENCE-NEXT: addq Constant(%rip), %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
%ret = add i64 %v, %arg
ret i64 %ret
@@ -2167,12 +2819,26 @@ define i64 @fold_constant_clobber(ptr %p, i64 %arg) {
}
define i64 @fold_constant_fence(i64 %arg) {
-; CHECK-LABEL: fold_constant_fence:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq Constant(%rip), %rax
-; CHECK-NEXT: mfence
-; CHECK-NEXT: addq %rdi, %rax
-; CHECK-NEXT: retq
+; CHECK-O0-LABEL: fold_constant_fence:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq Constant(%rip), %rax
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O0-NEXT: addq %rdi, %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: fold_constant_fence:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq Constant(%rip), %rax
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT: addq %rdi, %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_constant_fence:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq Constant(%rip), %rax
+; CHECK-MFENCE-NEXT: mfence
+; CHECK-MFENCE-NEXT: addq %rdi, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
fence seq_cst
%ret = add i64 %v, %arg
@@ -2194,12 +2860,26 @@ define i64 @fold_invariant_clobber(ptr dereferenceable(8) %p, i64 %arg) {
define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
-; CHECK-LABEL: fold_invariant_fence:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
-; CHECK-NEXT: addq %rsi, %rax
-; CHECK-NEXT: retq
+; CHECK-O0-LABEL: fold_invariant_fence:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O0-NEXT: addq %rsi, %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: fold_invariant_fence:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT: addq %rsi, %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_invariant_fence:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movq (%rdi), %rax
+; CHECK-MFENCE-NEXT: mfence
+; CHECK-MFENCE-NEXT: addq %rsi, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
fence seq_cst
%ret = add i64 %v, %arg
@@ -2222,6 +2902,12 @@ define i16 @load_i8_anyext_i16(ptr %ptr) {
; CHECK-O3-NEXT: movzbl (%rdi), %eax
; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i8_anyext_i16:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax
+; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i8, ptr %ptr unordered, align 2
%vec = insertelement <2 x i8> undef, i8 %v, i32 0
%res = bitcast <2 x i8> %vec to i16
@@ -2239,6 +2925,11 @@ define i32 @load_i8_anyext_i32(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzbl (%rdi), %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i8_anyext_i32:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i8, ptr %ptr unordered, align 4
%vec = insertelement <4 x i8> undef, i8 %v, i32 0
%res = bitcast <4 x i8> %vec to i32
@@ -2257,6 +2948,11 @@ define i32 @load_i16_anyext_i32(ptr %ptr) {
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzwl (%rdi), %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i16_anyext_i32:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i16, ptr %ptr unordered, align 4
%vec = insertelement <2 x i16> undef, i16 %v, i64 0
%res = bitcast <2 x i16> %vec to i32
@@ -2279,6 +2975,13 @@ define i64 @load_i16_anyext_i64(ptr %ptr) {
; CHECK-O3-NEXT: vmovd %eax, %xmm0
; CHECK-O3-NEXT: vmovq %xmm0, %rax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_i16_anyext_i64:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax
+; CHECK-MFENCE-NEXT: vmovd %eax, %xmm0
+; CHECK-MFENCE-NEXT: vmovq %xmm0, %rax
+; CHECK-MFENCE-NEXT: retq
%v = load atomic i16, ptr %ptr unordered, align 8
%vec = insertelement <4 x i16> undef, i16 %v, i64 0
%res = bitcast <4 x i16> %vec to i64
@@ -2307,6 +3010,15 @@ define i16 @load_combine(ptr %p) {
; CHECK-O3-NEXT: orl %ecx, %eax
; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: load_combine:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movzbl (%rdi), %ecx
+; CHECK-MFENCE-NEXT: movzbl 1(%rdi), %eax
+; CHECK-MFENCE-NEXT: shll $8, %eax
+; CHECK-MFENCE-NEXT: orl %ecx, %eax
+; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-MFENCE-NEXT: retq
%v1 = load atomic i8, ptr %p unordered, align 2
%p2 = getelementptr i8, ptr %p, i64 1
%v2 = load atomic i8, ptr %p2 unordered, align 1
@@ -2321,7 +3033,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-LABEL: fold_cmp_over_fence:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: mfence
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
; CHECK-O0-NEXT: jne .LBB116_2
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +3047,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-LABEL: fold_cmp_over_fence:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: mfence
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
; CHECK-O3-NEXT: jne .LBB116_2
; CHECK-O3-NEXT: # %bb.1: # %taken
@@ -2344,6 +3056,19 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-NEXT: .LBB116_2: # %untaken
; CHECK-O3-NEXT: xorl %eax, %eax
; CHECK-O3-NEXT: retq
+;
+; CHECK-MFENCE-LABEL: fold_cmp_over_fence:
+; CHECK-MFENCE: # %bb.0:
+; CHECK-MFENCE-NEXT: movl (%rdi), %eax
+; CHECK-MFENCE-NEXT: mfence
+; CHECK-MFENCE-NEXT: cmpl %eax, %esi
+; CHECK-MFENCE-NEXT: jne .LBB116_2
+; CHECK-MFENCE-NEXT: # %bb.1: # %taken
+; CHECK-MFENCE-NEXT: movb $1, %al
+; CHECK-MFENCE-NEXT: retq
+; CHECK-MFENCE-NEXT: .LBB116_2: # %untaken
+; CHECK-MFENCE-NEXT: xorl %eax, %eax
+; CHECK-MFENCE-NEXT: retq
%v2 = load atomic i32, ptr %p unordered, align 4
fence seq_cst
%cmp = icmp eq i32 %v1, %v2
diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll
index f34657b3f240c9..a7b4790bf801ee 100644
--- a/llvm/test/CodeGen/X86/mfence.ll
+++ b/llvm/test/CodeGen/X86/mfence.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+avoid-mfence | FileCheck %s --check-prefix=X64-NO-MFENCE
; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence.
@@ -14,6 +15,11 @@ define void @test() {
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: retq
+;
+; X64-NO-MFENCE-LABEL: test:
+; X64-NO-MFENCE: # %bb.0:
+; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NO-MFENCE-NEXT: retq
fence seq_cst
ret void
}
@@ -31,7 +37,33 @@ define i32 @fence(ptr %ptr) {
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
+;
+; X64-NO-MFENCE-LABEL: fence:
+; X64-NO-MFENCE: # %bb.0:
+; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NO-MFENCE-NEXT: movl (%rdi), %eax
+; X64-NO-MFENCE-NEXT: retq
%atomic = atomicrmw add ptr %ptr, i32 0 seq_cst
ret i32 %atomic
}
+define void @mfence() nounwind {
+; X32-LABEL: mfence:
+; X32: # %bb.0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; X64-LABEL: mfence:
+; X64: # %bb.0:
+; X64-NEXT: mfence
+; X64-NEXT: retq
+;
+; X64-NO-MFENCE-LABEL: mfence:
+; X64-NO-MFENCE: # %bb.0:
+; X64-NO-MFENCE-NEXT: mfence
+; X64-NO-MFENCE-NEXT: retq
+ call void @llvm.x86.sse2.mfence()
+ ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+
>From 707ca0e44f57bc2235d8ea29376ef45de9a1adb8 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Mon, 30 Sep 2024 16:09:24 +0200
Subject: [PATCH 2/2] fixup! [X86] Prefer `lock or` over mfence
---
llvm/lib/Target/X86/X86.td | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 3d79fb2efdd758..67486ae0ebf842 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1415,7 +1415,8 @@ def ProcessorFeatures {
TuningFastImm16,
TuningSBBDepBreaking,
TuningSlowDivide64,
- TuningSlowSHLD];
+ TuningSlowSHLD,
+ TuningAvoidMFENCE];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);