[llvm] d6f9278 - [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (#74275)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 16 15:24:26 PDT 2024
Author: James Y Knight
Date: 2024-05-16T18:24:23-04:00
New Revision: d6f9278ae9e587d2d23a9940a2364aaafba74735
URL: https://github.com/llvm/llvm-project/commit/d6f9278ae9e587d2d23a9940a2364aaafba74735
DIFF: https://github.com/llvm/llvm-project/commit/d6f9278ae9e587d2d23a9940a2364aaafba74735.diff
LOG: [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (#74275)
In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and further, guaranteed that all future CPUs
with AVX will do so as well.
Therefore, when AVX is enabled, we may use normal SSE 128-bit load/store
instructions to implement these atomics.
Per the AMD64 Architecture Programmer's Manual, section 7.3.2 "Access Atomicity":
> Processors that report [AVX] extend the atomicity for cacheable,
> naturally-aligned single loads or stores from a quadword to a double
> quadword.
Per Intel's SDM:
> Processors that enumerate support for Intel(R) AVX guarantee that the
> 16-byte memory operations performed by the following instructions will
> always be carried out atomically:
> - MOVAPD, MOVAPS, and MOVDQA.
> - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
> - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
> EVEX.128 and k0 (masking disabled).
This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
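As a rough illustration (a sketch assembled from the updated tests below, not
anything beyond what the patch itself does), an aligned 128-bit atomic load
such as:

  define i128 @load_i128(ptr %ptr) {
    %v = load atomic i128, ptr %ptr unordered, align 16
    ret i128 %v
  }

now compiles, with AVX enabled (e.g. llc -mtriple=x86_64 -mattr=+avx), to a
plain 16-byte vector load plus scalar extracts instead of a lock cmpxchg16b
loop:

  vmovdqa (%rdi), %xmm0
  vmovq   %xmm0, %rax
  vpextrq $1, %xmm0, %rdx
  retq

Stores are handled symmetrically with vmovdqa/vmovaps; a seq_cst store
additionally emits a locked stack operation as a barrier (see
atomic_store_seq_cst in atomic128.ll below).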
Added:
llvm/test/CodeGen/X86/atomic-unaligned.ll
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
llvm/test/CodeGen/X86/atomic-non-integer.ll
llvm/test/CodeGen/X86/atomic-unordered.ll
llvm/test/CodeGen/X86/atomic128.ll
llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e7c70e3872ad1..cd252c54887a5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -544,6 +544,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+ if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ // All CPUs supporting AVX will atomically load/store aligned 128-bit
+ // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ }
+
if (Subtarget.canUseCMPXCHG16B())
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
@@ -30415,32 +30422,40 @@ TargetLoweringBase::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
- bool NoImplicitFloatOps =
- SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
- if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE1() || Subtarget.hasX87()))
- return AtomicExpansionKind::None;
+ if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+ !Subtarget.useSoftFloat()) {
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+ Subtarget.hasAVX())
+ return AtomicExpansionKind::None;
+ }
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
: AtomicExpansionKind::None;
}
// Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();
- // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
- // can use movq to do the load. If we have X87 we can load into an 80-bit
- // X87 register and store it to a stack temporary.
- bool NoImplicitFloatOps =
- LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
- if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE1() || Subtarget.hasX87()))
- return AtomicExpansionKind::None;
+ if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+ !Subtarget.useSoftFloat()) {
+ // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+ // can use movq to do the load. If we have X87 we can load into an 80-bit
+ // X87 register and store it to a stack temporary.
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
+ if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+ Subtarget.hasAVX())
+ return AtomicExpansionKind::None;
+ }
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
@@ -31683,14 +31698,21 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
if (!IsSeqCst && IsTypeLegal)
return Op;
- if (VT == MVT::i64 && !IsTypeLegal) {
+ if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+ !DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat)) {
+ SDValue Chain;
+ // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+ // vector store.
+ if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+ Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+ Node->getMemOperand());
+ }
+
// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
- SDValue Chain;
+ if (VT == MVT::i64) {
if (Subtarget.hasSSE1()) {
SDValue SclToVec =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31722,15 +31744,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
}
+ }
- if (Chain) {
- // If this is a sequentially consistent store, also emit an appropriate
- // barrier.
- if (IsSeqCst)
- Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
- return Chain;
- }
+ return Chain;
}
}
@@ -33303,12 +33325,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_LOAD: {
- assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ assert(
+ (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+ "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
+
+ if (N->getValueType(0) == MVT::i128) {
+ if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+ Node->getBasePtr(), Node->getMemOperand());
+ SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(1, dl));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+ {ResL, ResH}));
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ break;
+ }
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll b/llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
index 9555c45086d6f..5960787fe30de 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
@@ -28,22 +28,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
;
; X64-AVX-LABEL: store_fp128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX-NEXT: .cfi_offset %rbx, -16
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movq (%rdi), %rax
-; X64-AVX-NEXT: movq 8(%rdi), %rdx
-; X64-AVX-NEXT: .p2align 4, 0x90
-; X64-AVX-NEXT: .LBB0_1: # %atomicrmw.start
-; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT: jne .LBB0_1
-; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
-; X64-AVX-NEXT: popq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
store atomic fp128 %v, ptr %fptr unordered, align 16
ret void
@@ -69,19 +54,9 @@ define fp128 @load_fp128(ptr %fptr) {
;
; X64-AVX-LABEL: load_fp128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX-NEXT: .cfi_offset %rbx, -16
-; X64-AVX-NEXT: xorl %eax, %eax
-; X64-AVX-NEXT: xorl %edx, %edx
-; X64-AVX-NEXT: xorl %ecx, %ecx
-; X64-AVX-NEXT: xorl %ebx, %ebx
-; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-AVX-NEXT: popq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: retq
%v = load atomic fp128, ptr %fptr unordered, align 16
ret fp128 %v
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 9e6f584d83112..9995e7d3a4d31 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -131,7 +131,6 @@ define void @store_double(ptr %fptr, double %v) {
ret void
}
-
define half @load_half(ptr %fptr) {
; X86-SSE1-LABEL: load_half:
; X86-SSE1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/atomic-unaligned.ll b/llvm/test/CodeGen/X86/atomic-unaligned.ll
new file mode 100644
index 0000000000000..f02041cc5fc8f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic-unaligned.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+; Quick test to ensure that atomics which are not naturally-aligned
+; emit unsized libcalls, and aren't emitted as native instructions or
+; sized libcalls.
+define void @test_i32(ptr %a) nounwind {
+; CHECK-LABEL: test_i32:
+; CHECK: callq __atomic_load
+; CHECK: callq __atomic_store
+; CHECK: callq __atomic_exchange
+; CHECK: callq __atomic_compare_exchange
+; CHECK: callq __atomic_compare_exchange
+ %t0 = load atomic i32, ptr %a seq_cst, align 2
+ store atomic i32 1, ptr %a seq_cst, align 2
+ %t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
+ %t3 = atomicrmw add ptr %a, i32 2 seq_cst, align 2
+ %t2 = cmpxchg ptr %a, i32 0, i32 1 seq_cst seq_cst, align 2
+ ret void
+}
+
+define void @test_i128(ptr %a) nounwind {
+; CHECK-LABEL: test_i128:
+; CHECK: callq __atomic_load
+; CHECK: callq __atomic_store
+; CHECK: callq __atomic_exchange
+; CHECK: callq __atomic_compare_exchange
+ %t0 = load atomic i128, ptr %a seq_cst, align 8
+ store atomic i128 1, ptr %a seq_cst, align 8
+ %t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8
+ %t2 = atomicrmw add ptr %a, i128 2 seq_cst, align 8
+ %t3 = cmpxchg ptr %a, i128 0, i128 1 seq_cst seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index df123be53474f..3fb994cdb751a 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -228,34 +228,12 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
}
define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: xorl %eax, %eax
-; CHECK-O0-NEXT: movl %eax, %ebx
-; CHECK-O0-NEXT: movq %rbx, %rax
-; CHECK-O0-NEXT: movq %rbx, %rdx
-; CHECK-O0-NEXT: movq %rbx, %rcx
-; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT: retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: xorl %eax, %eax
-; CHECK-O3-NEXT: xorl %edx, %edx
-; CHECK-O3-NEXT: xorl %ecx, %ecx
-; CHECK-O3-NEXT: xorl %ebx, %ebx
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT: retq
+; CHECK-LABEL: load_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT: retq
%v = load atomic i128, ptr %ptr unordered, align 16
ret i128 %v
}
@@ -263,51 +241,18 @@ define i128 @load_i128(ptr %ptr) {
define void @store_i128(ptr %ptr, i128 %v) {
; CHECK-O0-LABEL: store_i128:
; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq 8(%rdi), %rdx
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jmp .LBB16_1
-; CHECK-O0-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O0-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT: lock cmpxchg16b (%rsi)
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jne .LBB16_1
-; CHECK-O0-NEXT: jmp .LBB16_2
-; CHECK-O0-NEXT: .LBB16_2: # %atomicrmw.end
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT: vmovq %rsi, %xmm0
+; CHECK-O0-NEXT: vmovq %rdx, %xmm1
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: store_i128:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: movq %rdx, %rcx
-; CHECK-O3-NEXT: movq %rsi, %rbx
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq 8(%rdi), %rdx
-; CHECK-O3-NEXT: .p2align 4, 0x90
-; CHECK-O3-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O3-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: jne .LBB16_1
-; CHECK-O3-NEXT: # %bb.2: # %atomicrmw.end
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT: vmovq %rdx, %xmm0
+; CHECK-O3-NEXT: vmovq %rsi, %xmm1
+; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O3-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll
index 1f7c2254bc79f..949ee9e276a43 100644
--- a/llvm/test/CodeGen/X86/atomic128.ll
+++ b/llvm/test/CodeGen/X86/atomic128.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16,avx | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
; Codegen of i128 without cx16 is tested in atomic-nocx16.ll
@@ -28,20 +29,28 @@ define i128 @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
@cmpxchg16b_global = external dso_local global { i128, i128 }, align 16
;; Make sure we retain the offset of the global variable.
-define void @cmpxchg16b_global_with_offset() nounwind {
-; CHECK-LABEL: cmpxchg16b_global_with_offset:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b _cmpxchg16b_global+16(%rip)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+define i128 @load_global_with_offset() nounwind {
+; CHECK-NOAVX-LABEL: load_global_with_offset:
+; CHECK-NOAVX: ## %bb.0: ## %entry
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b _cmpxchg16b_global+16(%rip)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: load_global_with_offset:
+; CHECK-AVX: ## %bb.0: ## %entry
+; CHECK-AVX-NEXT: vmovdqa _cmpxchg16b_global+16(%rip), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
+;
entry:
%0 = load atomic i128, ptr getelementptr inbounds ({i128, i128}, ptr @cmpxchg16b_global, i64 0, i32 1) acquire, align 16
- ret void
+ ret i128 %0
}
define void @fetch_and_nand(ptr %p, i128 %bits) {
@@ -283,101 +292,140 @@ define void @fetch_and_umax(ptr %p, i128 %bits) {
}
define i128 @atomic_load_seq_cst(ptr %p) {
-; CHECK-LABEL: atomic_load_seq_cst:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_load_seq_cst:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_load_seq_cst:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
%r = load atomic i128, ptr %p seq_cst, align 16
ret i128 %r
}
define i128 @atomic_load_relaxed(ptr %p) {
-; CHECK-LABEL: atomic_load_relaxed:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_load_relaxed:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_load_relaxed:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
%r = load atomic i128, ptr %p monotonic, align 16
ret i128 %r
}
define void @atomic_store_seq_cst(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_seq_cst:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB12_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB12_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_seq_cst:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB12_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB12_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_seq_cst:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT: retq
store atomic i128 %in, ptr %p seq_cst, align 16
ret void
}
define void @atomic_store_release(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_release:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB13_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB13_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_release:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB13_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB13_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_release:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: retq
store atomic i128 %in, ptr %p release, align 16
ret void
}
define void @atomic_store_relaxed(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_relaxed:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB14_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB14_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_relaxed:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB14_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB14_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_relaxed:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: retq
store atomic i128 %in, ptr %p unordered, align 16
ret void
}
diff --git a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 57594870a54a8..3fb561d00f97d 100644
--- a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -110,11 +110,9 @@ define i128 @cmpxchg_use_eflags_and_val(ptr %addr, i128 %offset) {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1