[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Feb 2 12:25:39 PST 2025
https://github.com/jofrn created https://github.com/llvm/llvm-project/pull/125432
After splitting an atomic vector load, every element is created and
placed back into a concat_vectors, even when some of them are never
used. This change extends EltsFromConsecutiveLoads to understand
AtomicSDNode so that its concat_vectors can be mapped to a
BUILD_VECTOR, and unused elements are no longer referenced.
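
As a minimal illustration (a hypothetical function mirroring the
atomic_vec2_half test updated below):

```llvm
; Previously, the split atomic load became a concat_vectors whose
; per-element scalar loads were all emitted; with this change the
; result is rebuilt as a BUILD_VECTOR, so this lowers to a single
; 32-bit movss (see the updated CHECK lines in atomic-load-store.ll).
define <2 x half> @example_vec2_half(ptr %x) {
  %ret = load atomic <2 x half>, ptr %x acquire, align 4
  ret <2 x half> %ret
}
```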
---
**Stack**:
- #120716
- #125432 ⬅
- #120640
- #120598
- #120387
- #120386
- #120385
- #120384
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Do not merge manually using the UI - doing so may have unexpected results.*
From fbe99a2a2d6f3c95978c520812046567ec7811cd Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic
vector.
After splitting an atomic vector load, every element is created and
placed back into a concat_vectors, even when some of them are never
used. This change extends EltsFromConsecutiveLoads to understand
AtomicSDNode so that its concat_vectors can be mapped to a
BUILD_VECTOR, and unused elements are no longer referenced.
commit-id:b83937a8
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 4 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 20 ++-
.../SelectionDAGAddressAnalysis.cpp | 30 ++--
.../SelectionDAG/SelectionDAGBuilder.cpp | 6 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 28 +--
llvm/test/CodeGen/X86/atomic-load-store.ll | 167 ++----------------
6 files changed, 69 insertions(+), 186 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 461c0c1ead16d2c..bea5958ec0bba6e 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1840,7 +1840,7 @@ class SelectionDAG {
/// chain to the token factor. This ensures that the new memory node will have
/// the same relative memory dependency position as the old load. Returns the
/// new merged load chain.
- SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+ SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
/// Topological-sort the AllNodes list and
/// assign a unique node id for each node in the DAG based on their
@@ -2264,7 +2264,7 @@ class SelectionDAG {
/// merged. Check that both are nonvolatile and if LD is loading
/// 'Bytes' bytes from a location that is 'Dist' units away from the
/// location that the 'Base' load is loading from.
- bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+ bool areNonVolatileConsecutiveLoads(MemSDNode *LD, MemSDNode *Base,
unsigned Bytes, int Dist) const;
/// Infer alignment of a load / store address. Return std::nullopt if it
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b416c0efbbc4fc6..5f274fabfe8d640 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12167,7 +12167,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
return TokenFactor;
}
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
SDValue NewMemOp) {
assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
SDValue OldChain = SDValue(OldLoad, 1);
@@ -12879,17 +12879,21 @@ std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
getBuildVector(NewOvVT, dl, OvScalars));
}
-bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
- LoadSDNode *Base,
+bool SelectionDAG::areNonVolatileConsecutiveLoads(MemSDNode *LD,
+ MemSDNode *Base,
unsigned Bytes,
int Dist) const {
if (LD->isVolatile() || Base->isVolatile())
return false;
- // TODO: probably too restrictive for atomics, revisit
- if (!LD->isSimple())
- return false;
- if (LD->isIndexed() || Base->isIndexed())
- return false;
+  if (auto *Ld = dyn_cast<LoadSDNode>(LD)) {
+    if (!Ld->isSimple())
+      return false;
+    if (Ld->isIndexed())
+      return false;
+  }
+  if (auto *Ld = dyn_cast<LoadSDNode>(Base))
+    if (Ld->isIndexed())
+      return false;
if (LD->getChain() != Base->getChain())
return false;
EVT VT = LD->getMemoryVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index f2ab88851b780ef..a19af64a7962291 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -194,8 +194,8 @@ bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
return false;
}
-/// Parses tree in Ptr for base, index, offset addresses.
-static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
+/// Parses tree in Ptr for base, index, offset addresses.
+template <typename T>
+static BaseIndexOffset matchSDNode(const T *N,
const SelectionDAG &DAG) {
SDValue Ptr = N->getBasePtr();
@@ -206,16 +206,18 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
bool IsIndexSignExt = false;
// pre-inc/pre-dec ops are components of EA.
- if (N->getAddressingMode() == ISD::PRE_INC) {
- if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
- Offset += C->getSExtValue();
- else // If unknown, give up now.
- return BaseIndexOffset(SDValue(), SDValue(), 0, false);
- } else if (N->getAddressingMode() == ISD::PRE_DEC) {
- if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
- Offset -= C->getSExtValue();
- else // If unknown, give up now.
- return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ if constexpr (std::is_same_v<T, LSBaseSDNode>) {
+ if (N->getAddressingMode() == ISD::PRE_INC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset += C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset -= C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ }
}
// Consume constant adds & ors with appropriate masking.
@@ -300,8 +302,10 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
const SelectionDAG &DAG) {
+ if (const auto *AN = dyn_cast<AtomicSDNode>(N))
+ return matchSDNode(AN, DAG);
if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
- return matchLSNode(LS0, DAG);
+ return matchSDNode(LS0, DAG);
if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
if (LN->hasOffset())
return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 428e7a316d247b0..7e784b2919c2a6d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5218,7 +5218,11 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
L = DAG.getPtrExtOrTrunc(L, dl, VT);
setValue(&I, L);
- DAG.setRoot(OutChain);
+
+ if (VT.isVector())
+ DAG.setRoot(InChain);
+ else
+ DAG.setRoot(OutChain);
}
void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ba9ac2f21c7564d..3b8f3dd1e9a5e91 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7074,15 +7074,20 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
}
-// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
+// Recurse to find a MemSDNode source and the accumulated ByteOffset.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
- if (ISD::isNON_EXTLoad(Elt.getNode())) {
- auto *BaseLd = cast<LoadSDNode>(Elt);
- if (!BaseLd->isSimple())
- return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+ if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
Ld = BaseLd;
ByteOffset = 0;
return true;
}
+  if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt)) {
+    if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      if (!BaseLd->isSimple())
+        return false;
+      Ld = BaseLd;
+      ByteOffset = 0;
+      return true;
+    }
+  }
switch (Elt.getOpcode()) {
case ISD::BITCAST:
@@ -7135,7 +7140,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
APInt ZeroMask = APInt::getZero(NumElems);
APInt UndefMask = APInt::getZero(NumElems);
- SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+ SmallVector<MemSDNode*, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
@@ -7185,7 +7190,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
- LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ MemSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7199,8 +7204,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Check to see if the element's load is consecutive to the base load
// or offset from a previous (already checked) load.
- auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
- LoadSDNode *Ld = Loads[EltIdx];
+ auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+ MemSDNode *Ld = Loads[EltIdx];
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
@@ -7228,7 +7233,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+ auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(LDBase->isSimple() &&
"Cannot merge volatile or atomic loads.");
@@ -9271,8 +9276,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
- EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
return LD;
+ }
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 42b095582429345..08d0405345f5736 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -205,63 +205,19 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
}
define <2 x half> @atomic_vec2_half(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_half:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movl (%rdi), %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec2_half:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movl (%rdi), %eax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT: movw %ax, %cx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec2_half:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
%ret = load atomic <2 x half>, ptr %x acquire, align 4
ret <2 x half> %ret
}
define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_bfloat:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movl (%rdi), %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec2_bfloat:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movl (%rdi), %eax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %dx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec2_bfloat:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
%ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
ret <2 x bfloat> %ret
}
@@ -439,110 +395,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
}
define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movq (%rdi), %rax
-; CHECK3-NEXT: movl %eax, %ecx
-; CHECK3-NEXT: shrl $16, %ecx
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: movq %rax, %rcx
-; CHECK3-NEXT: shrq $32, %rcx
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT: shrq $48, %rax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movq (%rdi), %rax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm2
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT: movq %rax, %rcx
-; CHECK0-NEXT: shrq $32, %rcx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT: shrq $48, %rax
-; CHECK0-NEXT: movw %ax, %cx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm3
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
%ret = load atomic <4 x half>, ptr %x acquire, align 8
ret <4 x half> %ret
}
define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movq (%rdi), %rax
-; CHECK3-NEXT: movq %rax, %rcx
-; CHECK3-NEXT: movq %rax, %rdx
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: shrq $32, %rcx
-; CHECK3-NEXT: shrq $48, %rdx
-; CHECK3-NEXT: pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movq (%rdi), %rax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: movq %rax, %rsi
-; CHECK0-NEXT: shrq $32, %rsi
-; CHECK0-NEXT: ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT: shrq $48, %rax
-; CHECK0-NEXT: movw %ax, %di
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %di, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %si, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %dx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm2
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
%ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
ret <4 x bfloat> %ret
}