[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat May 10 01:27:12 PDT 2025
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432
From bf8fc80f870022c2a42d01a500e2b16d648dd376 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic
vector.
After splitting an atomic vector load, all elements are created; the two
components must be found by looking at the upper and lower halves of
EXTRACT_ELEMENT. This change extends EltsFromConsecutiveLoads to
understand AtomicSDNode so that unused elements can be removed.
commit-id:b83937a8
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 4 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 20 ++-
.../SelectionDAGAddressAnalysis.cpp | 30 ++--
llvm/lib/Target/X86/X86ISelLowering.cpp | 59 +++++--
llvm/test/CodeGen/X86/atomic-load-store.ll | 149 ++----------------
5 files changed, 90 insertions(+), 172 deletions(-)
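As a minimal sketch of the case this targets (function name is illustrative, drawn from the atomic-load-store.ll changes below): an acquire load of a small FP vector that the extended EltsFromConsecutiveLoads can now merge into a single scalar memory operation instead of per-element inserts.

  define <2 x half> @sketch_atomic_vec2_half(ptr %p) {
    ; With this patch the widened atomic load is recognized as an
    ; AtomicSDNode by EltsFromConsecutiveLoads, so the unused upper
    ; elements are dropped and a single 32-bit load is emitted
    ; (see the updated CHECK lines in atomic-load-store.ll).
    %v = load atomic <2 x half>, ptr %p acquire, align 4
    ret <2 x half> %v
  }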
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 87b6914f8a0ee..ab8bb517e6ae4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1873,7 +1873,7 @@ class SelectionDAG {
/// chain to the token factor. This ensures that the new memory node will have
/// the same relative memory dependency position as the old load. Returns the
/// new merged load chain.
- SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+ SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
/// Topological-sort the AllNodes list and a
/// assign a unique node id for each node in the DAG based on their
@@ -2311,7 +2311,7 @@ class SelectionDAG {
/// merged. Check that both are nonvolatile and if LD is loading
/// 'Bytes' bytes from a location that is 'Dist' units away from the
/// location that the 'Base' load is loading from.
- bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+ bool areNonVolatileConsecutiveLoads(MemSDNode *LD, MemSDNode *Base,
unsigned Bytes, int Dist) const;
/// Infer alignment of a load / store address. Return std::nullopt if it
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbf1b0fd590ef..38b22078c8c44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12215,7 +12215,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
return TokenFactor;
}
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
SDValue NewMemOp) {
assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
SDValue OldChain = SDValue(OldLoad, 1);
@@ -12905,17 +12905,21 @@ std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
getBuildVector(NewOvVT, dl, OvScalars));
}
-bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
- LoadSDNode *Base,
+bool SelectionDAG::areNonVolatileConsecutiveLoads(MemSDNode *LD,
+ MemSDNode *Base,
unsigned Bytes,
int Dist) const {
if (LD->isVolatile() || Base->isVolatile())
return false;
- // TODO: probably too restrictive for atomics, revisit
- if (!LD->isSimple())
- return false;
- if (LD->isIndexed() || Base->isIndexed())
- return false;
+ if (auto Ld = dyn_cast<LoadSDNode>(LD)) {
+ if (!Ld->isSimple())
+ return false;
+ if (Ld->isIndexed())
+ return false;
+ }
+ if (auto Ld = dyn_cast<LoadSDNode>(Base))
+ if (Ld->isIndexed())
+ return false;
if (LD->getChain() != Base->getChain())
return false;
EVT VT = LD->getMemoryVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index f2ab88851b780..c29cb424c7a4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -195,8 +195,8 @@ bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
}
/// Parses tree in Ptr for base, index, offset addresses.
-static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
- const SelectionDAG &DAG) {
+template <typename T>
+static BaseIndexOffset matchSDNode(const T *N, const SelectionDAG &DAG) {
SDValue Ptr = N->getBasePtr();
// (((B + I*M) + c)) + c ...
@@ -206,16 +206,18 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
bool IsIndexSignExt = false;
// pre-inc/pre-dec ops are components of EA.
- if (N->getAddressingMode() == ISD::PRE_INC) {
- if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
- Offset += C->getSExtValue();
- else // If unknown, give up now.
- return BaseIndexOffset(SDValue(), SDValue(), 0, false);
- } else if (N->getAddressingMode() == ISD::PRE_DEC) {
- if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
- Offset -= C->getSExtValue();
- else // If unknown, give up now.
- return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ if constexpr (std::is_same_v<T, LSBaseSDNode>) {
+ if (N->getAddressingMode() == ISD::PRE_INC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset += C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset -= C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ }
}
// Consume constant adds & ors with appropriate masking.
@@ -300,8 +302,10 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
const SelectionDAG &DAG) {
+ if (const auto *AN = dyn_cast<AtomicSDNode>(N))
+ return matchSDNode(AN, DAG);
if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
- return matchLSNode(LS0, DAG);
+ return matchSDNode(LS0, DAG);
if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
if (LN->hasOffset())
return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20d400c669693..3cfbf68be7ed6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
}
// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
- if (ISD::isNON_EXTLoad(Elt.getNode())) {
- auto *BaseLd = cast<LoadSDNode>(Elt);
- if (!BaseLd->isSimple())
- return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+ if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
Ld = BaseLd;
ByteOffset = 0;
return true;
- }
+ } else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
+ if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ if (!BaseLd->isSimple())
+ return false;
+ Ld = BaseLd;
+ ByteOffset = 0;
+ return true;
+ }
switch (Elt.getOpcode()) {
case ISD::BITCAST:
@@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
APInt ZeroMask = APInt::getZero(NumElems);
APInt UndefMask = APInt::getZero(NumElems);
- SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+ SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
@@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
- LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ MemSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7318,8 +7322,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Check to see if the element's load is consecutive to the base load
// or offset from a previous (already checked) load.
- auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
- LoadSDNode *Ld = Loads[EltIdx];
+ auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+ MemSDNode *Ld = Loads[EltIdx];
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
@@ -7347,7 +7351,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+ auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(LDBase->isSimple() &&
"Cannot merge volatile or atomic loads.");
@@ -9452,8 +9456,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
{
SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
if (SDValue LD =
- EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
return LD;
+ }
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
@@ -60388,6 +60393,35 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVZEXT_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Find the TokenFactor to locate the associated AtomicLoad.
+ SDNode *ALD = nullptr;
+ for (auto &TF : DAG.allnodes())
+ if (TF.getOpcode() == ISD::TokenFactor) {
+ SDValue L = TF.getOperand(0);
+ SDValue R = TF.getOperand(1);
+ if (L.getNode() == N)
+ ALD = R.getNode();
+ else if (R.getNode() == N)
+ ALD = L.getNode();
+ }
+
+ if (!ALD)
+ return SDValue();
+ if (!isa<AtomicSDNode>(ALD))
+ return SDValue();
+
+ // Replace the VZEXT_LOAD with the AtomicLoad.
+ SDLoc dl(N);
+ SDValue SV =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ N->getValueType(0).changeTypeToInteger(), SDValue(ALD, 0));
+ SDValue BC = DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), SV);
+ BC = DCI.CombineTo(N, BC, SDValue(ALD, 1));
+ return BC;
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -60584,6 +60618,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
+ case X86ISD::VZEXT_LOAD: return combineVZEXT_LOAD(N, DAG, DCI);
// clang-format on
}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 6e2e9d4b21891..f72970d12b6eb 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -208,29 +208,12 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
define <2 x half> @atomic_vec2_half(ptr %x) {
; CHECK3-LABEL: atomic_vec2_half:
; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movl (%rdi), %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK3-NEXT: retq
;
; CHECK0-LABEL: atomic_vec2_half:
; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movl (%rdi), %eax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT: movw %ax, %cx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK0-NEXT: retq
%ret = load atomic <2 x half>, ptr %x acquire, align 4
ret <2 x half> %ret
@@ -239,29 +222,12 @@ define <2 x half> @atomic_vec2_half(ptr %x) {
define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
; CHECK3-LABEL: atomic_vec2_bfloat:
; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movl (%rdi), %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK3-NEXT: retq
;
; CHECK0-LABEL: atomic_vec2_bfloat:
; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movl (%rdi), %eax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %dx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK0-NEXT: retq
%ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
ret <2 x bfloat> %ret
@@ -440,110 +406,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
}
define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movq (%rdi), %rax
-; CHECK3-NEXT: movl %eax, %ecx
-; CHECK3-NEXT: shrl $16, %ecx
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: movq %rax, %rcx
-; CHECK3-NEXT: shrq $32, %rcx
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT: shrq $48, %rax
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movq (%rdi), %rax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm2
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT: movq %rax, %rcx
-; CHECK0-NEXT: shrq $32, %rcx
-; CHECK0-NEXT: movw %cx, %dx
-; CHECK0-NEXT: ## implicit-def: $ecx
-; CHECK0-NEXT: movw %dx, %cx
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT: shrq $48, %rax
-; CHECK0-NEXT: movw %ax, %cx
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm3
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <4 x half>, ptr %x acquire, align 8
ret <4 x half> %ret
}
define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movq (%rdi), %rax
-; CHECK3-NEXT: movq %rax, %rcx
-; CHECK3-NEXT: movq %rax, %rdx
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT: ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT: shrl $16, %eax
-; CHECK3-NEXT: shrq $32, %rcx
-; CHECK3-NEXT: shrq $48, %rdx
-; CHECK3-NEXT: pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT: retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0: ## %bb.0:
-; CHECK0-NEXT: movq (%rdi), %rax
-; CHECK0-NEXT: movl %eax, %ecx
-; CHECK0-NEXT: shrl $16, %ecx
-; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT: movw %ax, %dx
-; CHECK0-NEXT: movq %rax, %rsi
-; CHECK0-NEXT: shrq $32, %rsi
-; CHECK0-NEXT: ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT: shrq $48, %rax
-; CHECK0-NEXT: movw %ax, %di
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %di, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %si, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm1
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %dx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm0
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT: ## implicit-def: $eax
-; CHECK0-NEXT: movw %cx, %ax
-; CHECK0-NEXT: ## implicit-def: $xmm2
-; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT: retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
ret <4 x bfloat> %ret
}