[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
James Y Knight via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 11 13:39:11 PST 2023
https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/74275
From 7baffd6d1f4254b1bd725ddc883a360d79267435 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Sat, 2 Dec 2023 23:05:26 -0500
Subject: [PATCH 1/2] [X86] Use plain load/store instead of cmpxchg16b for
atomics with AVX
In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and further, guaranteed that all future CPUs
with AVX will do so as well.
Therefore, we may use normal SSE 128-bit load/store instructions to
implement atomics, if AVX is enabled.
Also adjust the handling of unordered atomic load/store in
LegalizeIntegerTypes.cpp: it currently hardcodes a fallback to
ATOMIC_CMP_SWAP_WITH_SUCCESS, but it should instead fall back to
ATOMIC_LOAD/ATOMIC_STORE.
Per the AMD64 Architecture Programmer's Manual, 7.3.2 "Access Atomicity":
"""
Processors that report [AVX] extend the atomicity for cacheable,
naturally-aligned single loads or stores from a quadword to a double
quadword.
"""
Per Intel's SDM:
"""
Processors that enumerate support for Intel(R) AVX guarantee that the
16-byte memory operations performed by the following instructions will
always be carried out atomically:
- MOVAPD, MOVAPS, and MOVDQA.
- VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
- VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
EVEX.128 and k0 (masking disabled).
"""
This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
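To illustrate the effect (a rough sketch based on the updated test
expectations below, e.g. llvm/test/CodeGen/X86/atomic-unordered.ll): an
aligned atomic i128 load such as

  %v = load atomic i128, ptr %ptr unordered, align 16

previously compiled to a "lock cmpxchg16b" sequence, but with AVX enabled
on x86-64 it now lowers to a plain 16-byte vector load plus extracts:

  vmovdqa (%rdi), %xmm0
  vmovq %xmm0, %rax
  vpextrq $1, %xmm0, %rdx

Sequentially consistent atomic stores additionally emit a trailing locked
stack operation (lock orl $0 against the stack) as a barrier, as seen in
the atomic128.ll changes.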
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 28 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 94 +++++--
llvm/test/CodeGen/X86/atomic-non-integer.ll | 24 +-
llvm/test/CodeGen/X86/atomic-unordered.ll | 83 +-----
llvm/test/CodeGen/X86/atomic128.ll | 247 +++++++++++-------
llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll | 8 +-
6 files changed, 256 insertions(+), 228 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f8..5b496feee7a8f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,17 +3831,14 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
SDValue &Lo, SDValue &Hi) {
if (N->isAtomic()) {
- // It's typical to have larger CAS than atomic load instructions.
SDLoc dl(N);
EVT VT = N->getMemoryVT();
- SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
- SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue Swap = DAG.getAtomicCmpSwap(
- ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
- VT, VTs, N->getOperand(0),
- N->getOperand(1), Zero, Zero, N->getMemOperand());
- ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
- ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+ // We may support larger values in atomic_load than in a normal load
+ // (without splitting), so switch over if needed.
+ SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
+ N->getOperand(1), N->getMemOperand());
+ ReplaceValueWith(SDValue(N, 0), New.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), New.getValue(1));
return;
}
@@ -5399,14 +5396,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {
SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (N->isAtomic()) {
- // It's typical to have larger CAS than atomic store instructions.
+ // We may support larger values in atomic_store than in a normal store
+ // (without splitting), so switch over if needed.
SDLoc dl(N);
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
- N->getMemoryVT(),
- N->getOperand(0), N->getOperand(2),
- N->getOperand(1),
- N->getMemOperand());
- return Swap.getValue(1);
+ SDValue New =
+ DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
+ N->getOperand(1), N->getOperand(2), N->getMemOperand());
+ return New.getValue(0);
}
if (ISD::isNormalStore(N))
return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6167be7bdf84e9..1880cbc3a5bf35 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -515,6 +515,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+ if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ // All CPUs supporting AVX will atomically load/store aligned 128-bit
+ // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ }
+
if (Subtarget.canUseCMPXCHG16B())
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
@@ -30101,12 +30108,16 @@ TargetLoweringBase::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
- bool NoImplicitFloatOps =
- SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
- if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE1() || Subtarget.hasX87()))
- return AtomicExpansionKind::None;
+ if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+ !Subtarget.useSoftFloat()) {
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+ Subtarget.hasAVX())
+ return AtomicExpansionKind::None;
+ }
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
: AtomicExpansionKind::None;
@@ -30121,12 +30132,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
- bool NoImplicitFloatOps =
- LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
- if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE1() || Subtarget.hasX87()))
- return AtomicExpansionKind::None;
+ if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+ !Subtarget.useSoftFloat()) {
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+ Subtarget.hasAVX())
+ return AtomicExpansionKind::None;
+ }
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
@@ -31277,14 +31292,23 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
if (!IsSeqCst && IsTypeLegal)
return Op;
- if (VT == MVT::i64 && !IsTypeLegal) {
+ if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+ !DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat)) {
+ SDValue Chain;
+ // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+ // vector store.
+ if (VT == MVT::i128) {
+ if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+ Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+ Node->getMemOperand());
+ }
+ }
+
// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
- SDValue Chain;
+ if (VT == MVT::i64) {
if (Subtarget.hasSSE1()) {
SDValue SclToVec =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31316,15 +31340,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
}
+ }
- if (Chain) {
- // If this is a sequentially consistent store, also emit an appropriate
- // barrier.
- if (IsSeqCst)
- Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
- return Chain;
- }
+ return Chain;
}
}
@@ -32877,12 +32901,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_LOAD: {
- assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ assert(
+ (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+ "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
+
+ if (N->getValueType(0) == MVT::i128) {
+ if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+ SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+ Node->getBasePtr(), Node->getMemOperand());
+ SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(1, dl));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+ {ResL, ResH}));
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ break;
+ }
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 7d2810e57a25b5..22b45b13aae227 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -207,14 +207,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
;
; X64-AVX-LABEL: store_fp128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: subq $24, %rsp
-; X64-AVX-NEXT: .cfi_def_cfa_offset 32
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsp)
-; X64-AVX-NEXT: movq (%rsp), %rsi
-; X64-AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; X64-AVX-NEXT: callq __sync_lock_test_and_set_16@PLT
-; X64-AVX-NEXT: addq $24, %rsp
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
store atomic fp128 %v, ptr %fptr unordered, align 16
ret void
@@ -592,18 +585,9 @@ define fp128 @load_fp128(ptr %fptr) {
;
; X64-AVX-LABEL: load_fp128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: subq $24, %rsp
-; X64-AVX-NEXT: .cfi_def_cfa_offset 32
-; X64-AVX-NEXT: xorl %esi, %esi
-; X64-AVX-NEXT: xorl %edx, %edx
-; X64-AVX-NEXT: xorl %ecx, %ecx
-; X64-AVX-NEXT: xorl %r8d, %r8d
-; X64-AVX-NEXT: callq __sync_val_compare_and_swap_16@PLT
-; X64-AVX-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, (%rsp)
-; X64-AVX-NEXT: vmovaps (%rsp), %xmm0
-; X64-AVX-NEXT: addq $24, %rsp
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; X64-AVX-NEXT: retq
%v = load atomic fp128, ptr %fptr unordered, align 16
ret fp128 %v
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index b66988c8bd24b5..91e427189de477 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -230,34 +230,12 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
}
define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: xorl %eax, %eax
-; CHECK-O0-NEXT: movl %eax, %ebx
-; CHECK-O0-NEXT: movq %rbx, %rax
-; CHECK-O0-NEXT: movq %rbx, %rdx
-; CHECK-O0-NEXT: movq %rbx, %rcx
-; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT: retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: xorl %eax, %eax
-; CHECK-O3-NEXT: xorl %edx, %edx
-; CHECK-O3-NEXT: xorl %ecx, %ecx
-; CHECK-O3-NEXT: xorl %ebx, %ebx
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT: retq
+; CHECK-LABEL: load_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT: retq
%v = load atomic i128, ptr %ptr unordered, align 16
ret i128 %v
}
@@ -265,51 +243,18 @@ define i128 @load_i128(ptr %ptr) {
define void @store_i128(ptr %ptr, i128 %v) {
; CHECK-O0-LABEL: store_i128:
; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq 8(%rdi), %rdx
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jmp .LBB16_1
-; CHECK-O0-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O0-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT: lock cmpxchg16b (%rsi)
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jne .LBB16_1
-; CHECK-O0-NEXT: jmp .LBB16_2
-; CHECK-O0-NEXT: .LBB16_2: # %atomicrmw.end
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT: vmovq %rsi, %xmm0
+; CHECK-O0-NEXT: vmovq %rdx, %xmm1
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: store_i128:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: movq %rdx, %rcx
-; CHECK-O3-NEXT: movq %rsi, %rbx
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq 8(%rdi), %rdx
-; CHECK-O3-NEXT: .p2align 4, 0x90
-; CHECK-O3-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O3-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: jne .LBB16_1
-; CHECK-O3-NEXT: # %bb.2: # %atomicrmw.end
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT: vmovq %rdx, %xmm0
+; CHECK-O3-NEXT: vmovq %rsi, %xmm1
+; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O3-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll
index d5600b54a169d2..76c3b2c5f1bb13 100644
--- a/llvm/test/CodeGen/X86/atomic128.ll
+++ b/llvm/test/CodeGen/X86/atomic128.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16,avx | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
; RUN: llc < %s -mtriple=i386-linux-gnu -verify-machineinstrs -mattr=cx16 | FileCheck %s -check-prefixes=CHECK32
; RUN: llc < %s -mtriple=i386-linux-gnu -verify-machineinstrs -mattr=-cx16 | FileCheck %s -check-prefixes=CHECK32
@@ -83,21 +84,32 @@ define i128 @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
@cmpxchg16b_global = external dso_local global { i128, i128 }, align 16
;; Make sure we retain the offset of the global variable.
-define void @cmpxchg16b_global_with_offset() nounwind {
-; CHECK-LABEL: cmpxchg16b_global_with_offset:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b _cmpxchg16b_global+16(%rip)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+define i128 @load_global_with_offset() nounwind {
+; CHECK-NOAVX-LABEL: load_global_with_offset:
+; CHECK-NOAVX: ## %bb.0: ## %entry
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b _cmpxchg16b_global+16(%rip)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
;
-; CHECK32-LABEL: cmpxchg16b_global_with_offset:
+; CHECK-AVX-LABEL: load_global_with_offset:
+; CHECK-AVX: ## %bb.0: ## %entry
+; CHECK-AVX-NEXT: vmovdqa _cmpxchg16b_global+16(%rip), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
+;
+; CHECK32-LABEL: load_global_with_offset:
; CHECK32: # %bb.0: # %entry
-; CHECK32-NEXT: subl $36, %esp
+; CHECK32-NEXT: pushl %edi
+; CHECK32-NEXT: pushl %esi
+; CHECK32-NEXT: subl $20, %esp
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: subl $8, %esp
; CHECK32-NEXT: leal {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: pushl $0
; CHECK32-NEXT: pushl $0
@@ -110,11 +122,23 @@ define void @cmpxchg16b_global_with_offset() nounwind {
; CHECK32-NEXT: pushl $cmpxchg16b_global+16
; CHECK32-NEXT: pushl %eax
; CHECK32-NEXT: calll __sync_val_compare_and_swap_16
-; CHECK32-NEXT: addl $72, %esp
-; CHECK32-NEXT: retl
+; CHECK32-NEXT: addl $44, %esp
+; CHECK32-NEXT: movl (%esp), %eax
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT: movl %edi, 8(%esi)
+; CHECK32-NEXT: movl %edx, 12(%esi)
+; CHECK32-NEXT: movl %eax, (%esi)
+; CHECK32-NEXT: movl %ecx, 4(%esi)
+; CHECK32-NEXT: movl %esi, %eax
+; CHECK32-NEXT: addl $20, %esp
+; CHECK32-NEXT: popl %esi
+; CHECK32-NEXT: popl %edi
+; CHECK32-NEXT: retl $4
entry:
%0 = load atomic i128, ptr getelementptr inbounds ({i128, i128}, ptr @cmpxchg16b_global, i64 0, i32 1) acquire, align 16
- ret void
+ ret i128 %0
}
define void @fetch_and_nand(ptr %p, i128 %bits) {
@@ -676,18 +700,25 @@ define void @fetch_and_umax(ptr %p, i128 %bits) {
}
define i128 @atomic_load_seq_cst(ptr %p) {
-; CHECK-LABEL: atomic_load_seq_cst:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_load_seq_cst:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_load_seq_cst:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
;
; CHECK32-LABEL: atomic_load_seq_cst:
; CHECK32: # %bb.0:
@@ -748,18 +779,25 @@ define i128 @atomic_load_seq_cst(ptr %p) {
}
define i128 @atomic_load_relaxed(ptr %p) {
-; CHECK-LABEL: atomic_load_relaxed:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_load_relaxed:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: xorl %eax, %eax
+; CHECK-NOAVX-NEXT: xorl %edx, %edx
+; CHECK-NOAVX-NEXT: xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT: xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_load_relaxed:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT: retq
;
; CHECK32-LABEL: atomic_load_relaxed:
; CHECK32: # %bb.0:
@@ -820,23 +858,32 @@ define i128 @atomic_load_relaxed(ptr %p) {
}
define void @atomic_store_seq_cst(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_seq_cst:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB12_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB12_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_seq_cst:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB12_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB12_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_seq_cst:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT: retq
;
; CHECK32-LABEL: atomic_store_seq_cst:
; CHECK32: # %bb.0:
@@ -865,23 +912,31 @@ define void @atomic_store_seq_cst(ptr %p, i128 %in) {
}
define void @atomic_store_release(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_release:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB13_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB13_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_release:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB13_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB13_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_release:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: retq
;
; CHECK32-LABEL: atomic_store_release:
; CHECK32: # %bb.0:
@@ -910,23 +965,31 @@ define void @atomic_store_release(ptr %p, i128 %in) {
}
define void @atomic_store_relaxed(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_relaxed:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB14_1: ## %atomicrmw.start
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-NEXT: jne LBB14_1
-; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NOAVX-LABEL: atomic_store_relaxed:
+; CHECK-NOAVX: ## %bb.0:
+; CHECK-NOAVX-NEXT: pushq %rbx
+; CHECK-NOAVX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT: .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT: movq %rdx, %rcx
+; CHECK-NOAVX-NEXT: movq %rsi, %rbx
+; CHECK-NOAVX-NEXT: movq (%rdi), %rax
+; CHECK-NOAVX-NEXT: movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT: .p2align 4, 0x90
+; CHECK-NOAVX-NEXT: LBB14_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT: jne LBB14_1
+; CHECK-NOAVX-NEXT: ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT: popq %rbx
+; CHECK-NOAVX-NEXT: retq
+;
+; CHECK-AVX-LABEL: atomic_store_relaxed:
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT: vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT: retq
;
; CHECK32-LABEL: atomic_store_relaxed:
; CHECK32: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 57594870a54a85..3fb561d00f97d1 100644
--- a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -110,11 +110,9 @@ define i128 @cmpxchg_use_eflags_and_val(ptr %addr, i128 %offset) {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
From db3abe6b07517fbafc480a99f72fa62e9e4c9234 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Mon, 4 Dec 2023 12:30:47 -0500
Subject: [PATCH 2/2] Revert change to LegalizeIntegerTypes.cpp
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 28 +++---
llvm/test/CodeGen/X86/atomic-unordered.ll | 94 +++++++++++++++----
2 files changed, 90 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5b496feee7a8f4..54698edce7d6f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,14 +3831,17 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
SDValue &Lo, SDValue &Hi) {
if (N->isAtomic()) {
+ // It's typical to have larger CAS than atomic load instructions.
SDLoc dl(N);
EVT VT = N->getMemoryVT();
- // We may support larger values in atomic_load than in a normal load
- // (without splitting), so switch over if needed.
- SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
- N->getOperand(1), N->getMemOperand());
- ReplaceValueWith(SDValue(N, 0), New.getValue(0));
- ReplaceValueWith(SDValue(N, 1), New.getValue(1));
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+ VT, VTs, N->getOperand(0),
+ N->getOperand(1), Zero, Zero, N->getMemOperand());
+ ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
return;
}
@@ -5396,13 +5399,14 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {
SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (N->isAtomic()) {
- // We may support larger values in atomic_store than in a normal store
- // (without splitting), so switch over if needed.
+ // It's typical to have larger CAS than atomic store instructions.
SDLoc dl(N);
- SDValue New =
- DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
- N->getOperand(1), N->getOperand(2), N->getMemOperand());
- return New.getValue(0);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ N->getMemoryVT(),
+ N->getOperand(0), N->getOperand(2),
+ N->getOperand(1),
+ N->getMemOperand());
+ return Swap.getValue(1);
}
if (ISD::isNormalStore(N))
return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 91e427189de477..f93cbe6aa9a91c 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -230,32 +230,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
}
define i128 @load_i128(ptr %ptr) {
-; CHECK-LABEL: load_i128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
-; CHECK-NEXT: retq
+; CHECK-O0-CUR-LABEL: load_i128:
+; CHECK-O0-CUR: # %bb.0:
+; CHECK-O0-CUR-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-O0-CUR-NEXT: vmovq %xmm0, %rax
+; CHECK-O0-CUR-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-O0-CUR-NEXT: retq
+;
+; CHECK-O3-CUR-LABEL: load_i128:
+; CHECK-O3-CUR: # %bb.0:
+; CHECK-O3-CUR-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-O3-CUR-NEXT: vmovq %xmm0, %rax
+; CHECK-O3-CUR-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-O3-CUR-NEXT: retq
+;
+; CHECK-O0-EX-LABEL: load_i128:
+; CHECK-O0-EX: # %bb.0:
+; CHECK-O0-EX-NEXT: pushq %rbx
+; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT: .cfi_offset %rbx, -16
+; CHECK-O0-EX-NEXT: xorl %eax, %eax
+; CHECK-O0-EX-NEXT: movl %eax, %ebx
+; CHECK-O0-EX-NEXT: movq %rbx, %rax
+; CHECK-O0-EX-NEXT: movq %rbx, %rdx
+; CHECK-O0-EX-NEXT: movq %rbx, %rcx
+; CHECK-O0-EX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-O0-EX-NEXT: popq %rbx
+; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT: retq
+;
+; CHECK-O3-EX-LABEL: load_i128:
+; CHECK-O3-EX: # %bb.0:
+; CHECK-O3-EX-NEXT: pushq %rbx
+; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT: .cfi_offset %rbx, -16
+; CHECK-O3-EX-NEXT: xorl %eax, %eax
+; CHECK-O3-EX-NEXT: xorl %edx, %edx
+; CHECK-O3-EX-NEXT: xorl %ecx, %ecx
+; CHECK-O3-EX-NEXT: xorl %ebx, %ebx
+; CHECK-O3-EX-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-O3-EX-NEXT: popq %rbx
+; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT: retq
%v = load atomic i128, ptr %ptr unordered, align 16
ret i128 %v
}
define void @store_i128(ptr %ptr, i128 %v) {
-; CHECK-O0-LABEL: store_i128:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: vmovq %rsi, %xmm0
-; CHECK-O0-NEXT: vmovq %rdx, %xmm1
-; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-O0-NEXT: vmovdqa %xmm0, (%rdi)
-; CHECK-O0-NEXT: retq
+; CHECK-O0-CUR-LABEL: store_i128:
+; CHECK-O0-CUR: # %bb.0:
+; CHECK-O0-CUR-NEXT: vmovq %rsi, %xmm0
+; CHECK-O0-CUR-NEXT: vmovq %rdx, %xmm1
+; CHECK-O0-CUR-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-CUR-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-O0-CUR-NEXT: retq
;
-; CHECK-O3-LABEL: store_i128:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: vmovq %rdx, %xmm0
-; CHECK-O3-NEXT: vmovq %rsi, %xmm1
-; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
-; CHECK-O3-NEXT: retq
+; CHECK-O3-CUR-LABEL: store_i128:
+; CHECK-O3-CUR: # %bb.0:
+; CHECK-O3-CUR-NEXT: vmovq %rdx, %xmm0
+; CHECK-O3-CUR-NEXT: vmovq %rsi, %xmm1
+; CHECK-O3-CUR-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-CUR-NEXT: vmovdqa %xmm0, (%rdi)
+; CHECK-O3-CUR-NEXT: retq
+;
+; CHECK-O0-EX-LABEL: store_i128:
+; CHECK-O0-EX: # %bb.0:
+; CHECK-O0-EX-NEXT: pushq %rax
+; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT: callq __sync_lock_test_and_set_16@PLT
+; CHECK-O0-EX-NEXT: popq %rax
+; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT: retq
+;
+; CHECK-O3-EX-LABEL: store_i128:
+; CHECK-O3-EX: # %bb.0:
+; CHECK-O3-EX-NEXT: pushq %rax
+; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT: callq __sync_lock_test_and_set_16@PLT
+; CHECK-O3-EX-NEXT: popq %rax
+; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
}