[llvm] r226845 - merge consecutive stores of extracted vector elements (PR21711)
Quentin Colombet
qcolombet at apple.com
Thu Jan 22 10:28:00 PST 2015
Nice!
Thanks,
-Quentin
On Jan 22, 2015, at 10:21 AM, Sanjay Patel <spatel at rotateright.com> wrote:
> Author: spatel
> Date: Thu Jan 22 12:21:26 2015
> New Revision: 226845
>
> URL: http://llvm.org/viewvc/llvm-project?rev=226845&view=rev
> Log:
> merge consecutive stores of extracted vector elements (PR21711)
>
> This is a 2nd try at the same optimization as http://reviews.llvm.org/D6698.
> That patch was checked in at r224611, but reverted at r225031 because it
> caused a failure outside of the regression tests.
>
> The cause of the crash was not recognizing consecutive stores that have mixed
> source values (loads and vector element extracts), so this patch adds a check
> to bail out if any store value is not coming from a vector element extract.
>
> This patch also refactors the shared logic of the constant source and vector
> extracted elements source cases into a helper function.
>
> Differential Revision: http://reviews.llvm.org/D6850
>
>
> Modified:
> llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=226845&r1=226844&r2=226845&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Jan 22 12:21:26 2015
> @@ -363,6 +363,28 @@ namespace {
> /// chain (aliasing node.)
> SDValue FindBetterChain(SDNode *N, SDValue Chain);
>
> + /// Holds a pointer to an LSBaseSDNode as well as information on where it
> + /// is located in a sequence of memory operations connected by a chain.
> + struct MemOpLink {
> + MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
> + MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
> + // Ptr to the mem node.
> + LSBaseSDNode *MemNode;
> + // Offset from the base ptr.
> + int64_t OffsetFromBase;
> + // What is the sequence number of this mem node.
> + // Lowest mem operand in the DAG starts at zero.
> + unsigned SequenceNum;
> + };
> +
> + /// This is a helper function for MergeConsecutiveStores. When the source
> + /// elements of the consecutive stores are all constants or all extracted
> + /// vector elements, try to merge them into one larger store.
> + /// \return True if a merged store was created.
> + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
> + EVT MemVT, unsigned NumElem,
> + bool IsConstantSrc, bool UseVector);
> +
> /// Merge consecutive store operations into a wide store.
> /// This optimization uses wide integers or vectors when possible.
> /// \return True if some memory operations were changed.
> @@ -9706,19 +9728,116 @@ struct BaseIndexOffset {
> }
> };
>
> -/// Holds a pointer to an LSBaseSDNode as well as information on where it
> -/// is located in a sequence of memory operations connected by a chain.
> -struct MemOpLink {
> - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
> - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
> - // Ptr to the mem node.
> - LSBaseSDNode *MemNode;
> - // Offset from the base ptr.
> - int64_t OffsetFromBase;
> - // What is the sequence number of this mem node.
> - // Lowest mem operand in the DAG starts at zero.
> - unsigned SequenceNum;
> -};
> +bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
> + SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
> + unsigned NumElem, bool IsConstantSrc, bool UseVector) {
> + // Make sure we have something to merge.
> + if (NumElem < 2)
> + return false;
> +
> + int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
> + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
> + unsigned EarliestNodeUsed = 0;
> +
> + for (unsigned i=0; i < NumElem; ++i) {
> + // Find a chain for the new wide-store operand. Notice that some
> + // of the store nodes that we found may not be selected for inclusion
> + // in the wide store. The chain we use needs to be the chain of the
> + // earliest store node which is *used* and replaced by the wide store.
> + if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
> + EarliestNodeUsed = i;
> + }
> +
> + // The earliest Node in the DAG.
> + LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
> + SDLoc DL(StoreNodes[0].MemNode);
> +
> + SDValue StoredVal;
> + if (UseVector) {
> + // Find a legal type for the vector store.
> + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
> + assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
> + if (IsConstantSrc) {
> + // A vector store with a constant source implies that the constant is
> + // zero; we only handle merging stores of constant zeros because the zero
> + // can be materialized without a load.
> + // It may be beneficial to loosen this restriction to allow non-zero
> + // store merging.
> + StoredVal = DAG.getConstant(0, Ty);
> + } else {
> + SmallVector<SDValue, 8> Ops;
> + for (unsigned i = 0; i < NumElem ; ++i) {
> + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> + SDValue Val = St->getValue();
> + // All of the operands of a BUILD_VECTOR must have the same type.
> + if (Val.getValueType() != MemVT)
> + return false;
> + Ops.push_back(Val);
> + }
> +
> + // Build the extracted vector elements back into a vector.
> + StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
> + }
> + } else {
> + // We should always use a vector store when merging extracted vector
> + // elements, so this path implies a store of constants.
> + assert(IsConstantSrc && "Merged vector elements should use vector store");
> +
> + unsigned StoreBW = NumElem * ElementSizeBytes * 8;
> + APInt StoreInt(StoreBW, 0);
> +
> + // Construct a single integer constant which is made of the smaller
> + // constant inputs.
> + bool IsLE = TLI.isLittleEndian();
> + for (unsigned i = 0; i < NumElem ; ++i) {
> + unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
> + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
> + SDValue Val = St->getValue();
> + StoreInt <<= ElementSizeBytes*8;
> + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
> + StoreInt |= C->getAPIntValue().zext(StoreBW);
> + } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
> + StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
> + } else {
> + llvm_unreachable("Invalid constant element type");
> + }
> + }
> +
> + // Create the new Load and Store operations.
> + EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
> + StoredVal = DAG.getConstant(StoreInt, StoreTy);
> + }
> +
> + SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
> + FirstInChain->getBasePtr(),
> + FirstInChain->getPointerInfo(),
> + false, false,
> + FirstInChain->getAlignment());
> +
> + // Replace the first store with the new store
> + CombineTo(EarliestOp, NewStore);
> + // Erase all other stores.
> + for (unsigned i = 0; i < NumElem ; ++i) {
> + if (StoreNodes[i].MemNode == EarliestOp)
> + continue;
> + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> + // ReplaceAllUsesWith will replace all uses that existed when it was
> + // called, but graph optimizations may cause new ones to appear. For
> + // example, the case in pr14333 looks like
> + //
> + // St's chain -> St -> another store -> X
> + //
> + // And the only difference from St to the other store is the chain.
> + // When we change it's chain to be St's chain they become identical,
> + // get CSEed and the net result is that X is now a use of St.
> + // Since we know that St is redundant, just iterate.
> + while (!St->use_empty())
> + DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
> + deleteAndRecombine(St);
> + }
> +
> + return true;
> +}
>
> bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
> EVT MemVT = St->getMemoryVT();
> @@ -9731,11 +9850,14 @@ bool DAGCombiner::MergeConsecutiveStores
> return false;
>
> // Perform an early exit check. Do not bother looking at stored values that
> - // are not constants or loads.
> + // are not constants, loads, or extracted vector elements.
> SDValue StoredVal = St->getValue();
> bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
> - if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
> - !IsLoadSrc)
> + bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
> + isa<ConstantFPSDNode>(StoredVal);
> + bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
> +
> + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
> return false;
>
> // Only look at ends of store sequences.
> @@ -9877,7 +9999,7 @@ bool DAGCombiner::MergeConsecutiveStores
> LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
>
> // Store the constants into memory as one consecutive store.
> - if (!IsLoadSrc) {
> + if (IsConstantSrc) {
> unsigned LastLegalType = 0;
> unsigned LastLegalVectorType = 0;
> bool NonZero = false;
> @@ -9926,85 +10048,33 @@ bool DAGCombiner::MergeConsecutiveStores
> bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
> unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
>
> - // Make sure we have something to merge.
> - if (NumElem < 2)
> - return false;
> -
> - unsigned EarliestNodeUsed = 0;
> - for (unsigned i=0; i < NumElem; ++i) {
> - // Find a chain for the new wide-store operand. Notice that some
> - // of the store nodes that we found may not be selected for inclusion
> - // in the wide store. The chain we use needs to be the chain of the
> - // earliest store node which is *used* and replaced by the wide store.
> - if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
> - EarliestNodeUsed = i;
> - }
> -
> - // The earliest Node in the DAG.
> - LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
> - SDLoc DL(StoreNodes[0].MemNode);
> + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
> + true, UseVector);
> + }
>
> - SDValue StoredVal;
> - if (UseVector) {
> + // When extracting multiple vector elements, try to store them
> + // in one vector store rather than a sequence of scalar stores.
> + if (IsExtractVecEltSrc) {
> + unsigned NumElem = 0;
> + for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
> + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> + SDValue StoredVal = St->getValue();
> + // This restriction could be loosened.
> + // Bail out if any stored values are not elements extracted from a vector.
> + // It should be possible to handle mixed sources, but load sources need
> + // more careful handling (see the block of code below that handles
> + // consecutive loads).
> + if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
> + return false;
> +
> // Find a legal type for the vector store.
> - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
> - assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
> - StoredVal = DAG.getConstant(0, Ty);
> - } else {
> - unsigned StoreBW = NumElem * ElementSizeBytes * 8;
> - APInt StoreInt(StoreBW, 0);
> -
> - // Construct a single integer constant which is made of the smaller
> - // constant inputs.
> - bool IsLE = TLI.isLittleEndian();
> - for (unsigned i = 0; i < NumElem ; ++i) {
> - unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
> - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
> - SDValue Val = St->getValue();
> - StoreInt<<=ElementSizeBytes*8;
> - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
> - StoreInt|=C->getAPIntValue().zext(StoreBW);
> - } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
> - StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
> - } else {
> - llvm_unreachable("Invalid constant element type");
> - }
> - }
> -
> - // Create the new Load and Store operations.
> - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
> - StoredVal = DAG.getConstant(StoreInt, StoreTy);
> - }
> -
> - SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
> - FirstInChain->getBasePtr(),
> - FirstInChain->getPointerInfo(),
> - false, false,
> - FirstInChain->getAlignment());
> -
> - // Replace the first store with the new store
> - CombineTo(EarliestOp, NewStore);
> - // Erase all other stores.
> - for (unsigned i = 0; i < NumElem ; ++i) {
> - if (StoreNodes[i].MemNode == EarliestOp)
> - continue;
> - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> - // ReplaceAllUsesWith will replace all uses that existed when it was
> - // called, but graph optimizations may cause new ones to appear. For
> - // example, the case in pr14333 looks like
> - //
> - // St's chain -> St -> another store -> X
> - //
> - // And the only difference from St to the other store is the chain.
> - // When we change it's chain to be St's chain they become identical,
> - // get CSEed and the net result is that X is now a use of St.
> - // Since we know that St is redundant, just iterate.
> - while (!St->use_empty())
> - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
> - deleteAndRecombine(St);
> + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
> + if (TLI.isTypeLegal(Ty))
> + NumElem = i + 1;
> }
>
> - return true;
> + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
> + false, true);
> }
>
> // Below we handle the case of multiple consecutive stores that
>
> Modified: llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll?rev=226845&r1=226844&r2=226845&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll Thu Jan 22 12:21:26 2015
> @@ -434,3 +434,62 @@ define void @loadStoreBaseIndexOffsetSex
> ; <label>:14
> ret void
> }
> +
> +; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
> +define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
> + %vecext0 = extractelement <8 x float> %v, i32 0
> + %vecext1 = extractelement <8 x float> %v, i32 1
> + %vecext2 = extractelement <8 x float> %v, i32 2
> + %vecext3 = extractelement <8 x float> %v, i32 3
> + %vecext4 = extractelement <8 x float> %v, i32 4
> + %vecext5 = extractelement <8 x float> %v, i32 5
> + %vecext6 = extractelement <8 x float> %v, i32 6
> + %vecext7 = extractelement <8 x float> %v, i32 7
> + %arrayidx1 = getelementptr inbounds float* %ptr, i64 1
> + %arrayidx2 = getelementptr inbounds float* %ptr, i64 2
> + %arrayidx3 = getelementptr inbounds float* %ptr, i64 3
> + %arrayidx4 = getelementptr inbounds float* %ptr, i64 4
> + %arrayidx5 = getelementptr inbounds float* %ptr, i64 5
> + %arrayidx6 = getelementptr inbounds float* %ptr, i64 6
> + %arrayidx7 = getelementptr inbounds float* %ptr, i64 7
> + store float %vecext0, float* %ptr, align 4
> + store float %vecext1, float* %arrayidx1, align 4
> + store float %vecext2, float* %arrayidx2, align 4
> + store float %vecext3, float* %arrayidx3, align 4
> + store float %vecext4, float* %arrayidx4, align 4
> + store float %vecext5, float* %arrayidx5, align 4
> + store float %vecext6, float* %arrayidx6, align 4
> + store float %vecext7, float* %arrayidx7, align 4
> + ret void
> +
> +; CHECK-LABEL: merge_vec_element_store
> +; CHECK: vmovups
> +; CHECK-NEXT: vzeroupper
> +; CHECK-NEXT: retq
> +}
> +
> +; This is a minimized test based on real code that was failing.
> +; We could merge stores (and loads) like this...
> +
> +define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
> + %idx0 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 0
> + %idx1 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 1
> + %idx4 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 4
> + %idx5 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 5
> +
> + %a0 = load i64* %idx0, align 8
> + store i64 %a0, i64* %idx4, align 8
> +
> + %b = bitcast i64* %idx1 to <2 x i64>*
> + %v = load <2 x i64>* %b, align 8
> + %a1 = extractelement <2 x i64> %v, i32 0
> + store i64 %a1, i64* %idx5, align 8
> + ret void
> +
> +; CHECK-LABEL: merge_vec_element_and_scalar_load
> +; CHECK: movq (%rdi), %rax
> +; CHECK-NEXT: movq %rax, 32(%rdi)
> +; CHECK-NEXT: movq 8(%rdi), %rax
> +; CHECK-NEXT: movq %rax, 40(%rdi)
> +; CHECK-NEXT: retq
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list