[llvm] r362723 - [DAGCombine] MergeConsecutiveStores - improve non-temporal load\store handling (PR42123)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 6 10:04:14 PDT 2019
Author: rksimon
Date: Thu Jun 6 10:04:13 2019
New Revision: 362723
URL: http://llvm.org/viewvc/llvm-project?rev=362723&view=rev
Log:
[DAGCombine] MergeConsecutiveStores - improve non-temporal load\store handling (PR42123)
This patch is the first step towards ensuring MergeConsecutiveStores correctly handles non-temporal loads/stores:
1 - When merging loads/stores we must ensure that they all have the same non-temporal flag. A mismatch is unlikely to occur, but it can in strange cases where we're storing at the end of one page and the beginning of another.
2 - The merged load/store node must retain the non-temporal flag.
Differential Revision: https://reviews.llvm.org/D62910
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=362723&r1=362722&r2=362723&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Jun 6 10:04:13 2019
@@ -14958,14 +14958,18 @@ void DAGCombiner::getStoreMergeCandidate
// Loads must only have one use.
if (!Ld->hasNUsesOfValue(1, 0))
return;
- // The memory operands must not be volatile.
+ // The memory operands must not be volatile/indexed.
if (Ld->isVolatile() || Ld->isIndexed())
return;
}
auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
int64_t &Offset) -> bool {
+ // The memory operands must not be volatile/indexed.
if (Other->isVolatile() || Other->isIndexed())
return false;
+ // Don't mix temporal stores with non-temporal stores.
+ if (St->isNonTemporal() != Other->isNonTemporal())
+ return false;
SDValue OtherBC = peekThroughBitcasts(Other->getValue());
// Allow merging constants of different types as integers.
bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
@@ -14975,15 +14979,18 @@ void DAGCombiner::getStoreMergeCandidate
return false;
// The Load's Base Ptr must also match
if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) {
- auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
+ BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
if (LoadVT != OtherLd->getMemoryVT())
return false;
// Loads must only have one use.
if (!OtherLd->hasNUsesOfValue(1, 0))
return false;
- // The memory operands must not be volatile.
+ // The memory operands must not be volatile/indexed.
if (OtherLd->isVolatile() || OtherLd->isIndexed())
return false;
+ // Don't mix temporal loads with non-temporal loads.
+ if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
+ return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
} else
@@ -15140,6 +15147,9 @@ bool DAGCombiner::MergeConsecutiveStores
isa<ConstantFPSDNode>(StoredVal);
bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
+ bool IsNonTemporalStore = St->isNonTemporal();
+ bool IsNonTemporalLoad =
+ IsLoadSrc && cast<LoadSDNode>(StoredVal)->isNonTemporal();
if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
return false;
@@ -15583,26 +15593,32 @@ bool DAGCombiner::MergeConsecutiveStores
SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
AddToWorklist(NewStoreChain.getNode());
- MachineMemOperand::Flags MMOFlags =
+ MachineMemOperand::Flags LdMMOFlags =
isDereferenceable ? MachineMemOperand::MODereferenceable
: MachineMemOperand::MONone;
+ if (IsNonTemporalLoad)
+ LdMMOFlags |= MachineMemOperand::MONonTemporal;
+
+ MachineMemOperand::Flags StMMOFlags =
+ IsNonTemporalStore ? MachineMemOperand::MONonTemporal
+ : MachineMemOperand::MONone;
SDValue NewLoad, NewStore;
if (UseVectorTy || !DoIntegerTruncate) {
NewLoad =
DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
- FirstLoadAlign, MMOFlags);
+ FirstLoadAlign, LdMMOFlags);
NewStore = DAG.getStore(
NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstStoreAlign);
+ FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
} else { // This must be the truncstore/extload case
EVT ExtendedTy =
TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
FirstLoad->getChain(), FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), JointMemOpVT,
- FirstLoadAlign, MMOFlags);
+ FirstLoadAlign, LdMMOFlags);
NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(),
Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll?rev=362723&r1=362722&r2=362723&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll Thu Jun 6 10:04:13 2019
@@ -10,10 +10,6 @@
; PR42123
;
-; FIXME: AVX doesn't retain NT flag on load/store.
-; AVX1 load should be 2 x VMOVNTDQA xmm.
-; AVX2 load should be VMOVNTDQA ymm.
-; AVX store should be VMOVNTPS ymm.
define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) {
; X86-LABEL: merge_2_v4f32_align32:
; X86: # %bb.0:
@@ -49,12 +45,20 @@ define void @merge_2_v4f32_align32(<4 x
; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
-; X64-AVX-LABEL: merge_2_v4f32_align32:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX-NEXT: vmovaps %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: merge_2_v4f32_align32:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi)
+; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi)
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: merge_2_v4f32_align32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
%3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
@@ -66,8 +70,7 @@ define void @merge_2_v4f32_align32(<4 x
ret void
}
-; FIXME: shouldn't attempt to merge nt and non-nt loads even if aligned.
-; Must be kept seperate as VMOVNTDQA xmm + VMOVDQA xmm.
+; Don't merge nt and non-nt loads even if aligned.
define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86: # %bb.0:
@@ -105,9 +108,10 @@ define void @merge_2_v4f32_align32_mix_n
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX-NEXT: vmovaps %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
@@ -120,8 +124,7 @@ define void @merge_2_v4f32_align32_mix_n
ret void
}
-; FIXME: shouldn't attempt to merge nt and non-nt stores even if aligned.
-; Must be kept seperate as VMOVNTPS xmm + VMOVAPS xmm.
+; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86: # %bb.0:
@@ -143,9 +146,10 @@ define void @merge_2_v4f32_align32_mix_n
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX-NEXT: vmovaps %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
+; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
@@ -158,7 +162,7 @@ define void @merge_2_v4f32_align32_mix_n
ret void
}
-; FIXME: AVX can't perform NT-load-ymm on 16-byte aligned memory.
+; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept seperate as VMOVNTDQA xmm.
define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) {
; X86-LABEL: merge_2_v4f32_align16_ntload:
@@ -195,12 +199,20 @@ define void @merge_2_v4f32_align16_ntloa
; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
-; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; X64-AVX1-NEXT: vmovdqa %xmm1, 16(%rsi)
+; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX2-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
%3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
More information about the llvm-commits mailing list