[llvm] r226845 - merge consecutive stores of extracted vector elements (PR21711)

Thu Jan 22 10:28:00 PST 2015

Nice!

Thanks,
-Quentin

On Jan 22, 2015, at 10:21 AM, Sanjay Patel <spatel at rotateright.com> wrote:

> Author: spatel
> Date: Thu Jan 22 12:21:26 2015
> New Revision: 226845
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=226845&view=rev
> Log:
> merge consecutive stores of extracted vector elements (PR21711)
> 
> This is a 2nd try at the same optimization as http://reviews.llvm.org/D6698. 
> That patch was checked in at r224611, but reverted at r225031 because it
> caused a failure outside of the regression tests.
> 
> The cause of the crash was not recognizing consecutive stores that have mixed
> source values (loads and vector element extracts), so this patch adds a check
> to bail out if any store value is not coming from a vector element extract.
> 
> This patch also refactors the shared logic of the constant source and vector
> extracted elements source cases into a helper function.
> 
> Differential Revision: http://reviews.llvm.org/D6850
> 
> 
> Modified:
>    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>    llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll
> 
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=226845&r1=226844&r2=226845&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Jan 22 12:21:26 2015
> @@ -363,6 +363,28 @@ namespace {
>     /// chain (aliasing node.)
>     SDValue FindBetterChain(SDNode *N, SDValue Chain);
> 
> +    /// Holds a pointer to an LSBaseSDNode as well as information on where it
> +    /// is located in a sequence of memory operations connected by a chain.
> +    struct MemOpLink {
> +      MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
> +      MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
> +      // Ptr to the mem node.
> +      LSBaseSDNode *MemNode;
> +      // Offset from the base ptr.
> +      int64_t OffsetFromBase;
> +      // What is the sequence number of this mem node.
> +      // Lowest mem operand in the DAG starts at zero.
> +      unsigned SequenceNum;
> +    };
> +
> +    /// This is a helper function for MergeConsecutiveStores. When the source
> +    /// elements of the consecutive stores are all constants or all extracted
> +    /// vector elements, try to merge them into one larger store.
> +    /// \return True if a merged store was created.
> +    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
> +                                         EVT MemVT, unsigned NumElem,
> +                                         bool IsConstantSrc, bool UseVector);
> +    
>     /// Merge consecutive store operations into a wide store.
>     /// This optimization uses wide integers or vectors when possible.
>     /// \return True if some memory operations were changed.
> @@ -9706,19 +9728,116 @@ struct BaseIndexOffset {
>   }
> };
> 
> -/// Holds a pointer to an LSBaseSDNode as well as information on where it
> -/// is located in a sequence of memory operations connected by a chain.
> -struct MemOpLink {
> -  MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
> -    MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
> -  // Ptr to the mem node.
> -  LSBaseSDNode *MemNode;
> -  // Offset from the base ptr.
> -  int64_t OffsetFromBase;
> -  // What is the sequence number of this mem node.
> -  // Lowest mem operand in the DAG starts at zero.
> -  unsigned SequenceNum;
> -};
> +bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
> +                  SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
> +                  unsigned NumElem, bool IsConstantSrc, bool UseVector) {
> +  // Make sure we have something to merge.
> +  if (NumElem < 2)
> +    return false;
> +  
> +  int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
> +  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
> +  unsigned EarliestNodeUsed = 0;
> +  
> +  for (unsigned i=0; i < NumElem; ++i) {
> +    // Find a chain for the new wide-store operand. Notice that some
> +    // of the store nodes that we found may not be selected for inclusion
> +    // in the wide store. The chain we use needs to be the chain of the
> +    // earliest store node which is *used* and replaced by the wide store.
> +    if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
> +      EarliestNodeUsed = i;
> +  }
> +  
> +  // The earliest Node in the DAG.
> +  LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
> +  SDLoc DL(StoreNodes[0].MemNode);
> +  
> +  SDValue StoredVal;
> +  if (UseVector) {
> +    // Find a legal type for the vector store.
> +    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
> +    assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
> +    if (IsConstantSrc) {
> +      // A vector store with a constant source implies that the constant is
> +      // zero; we only handle merging stores of constant zeros because the zero
> +      // can be materialized without a load.
> +      // It may be beneficial to loosen this restriction to allow non-zero
> +      // store merging.
> +      StoredVal = DAG.getConstant(0, Ty);
> +    } else {
> +      SmallVector<SDValue, 8> Ops;
> +      for (unsigned i = 0; i < NumElem ; ++i) {
> +        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> +        SDValue Val = St->getValue();
> +        // All of the operands of a BUILD_VECTOR must have the same type.
> +        if (Val.getValueType() != MemVT)
> +          return false;
> +        Ops.push_back(Val);
> +      }
> +      
> +      // Build the extracted vector elements back into a vector.
> +      StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
> +    }
> +  } else {
> +    // We should always use a vector store when merging extracted vector
> +    // elements, so this path implies a store of constants.
> +    assert(IsConstantSrc && "Merged vector elements should use vector store");
> +
> +    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
> +    APInt StoreInt(StoreBW, 0);
> +    
> +    // Construct a single integer constant which is made of the smaller
> +    // constant inputs.
> +    bool IsLE = TLI.isLittleEndian();
> +    for (unsigned i = 0; i < NumElem ; ++i) {
> +      unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
> +      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
> +      SDValue Val = St->getValue();
> +      StoreInt <<= ElementSizeBytes*8;
> +      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
> +        StoreInt |= C->getAPIntValue().zext(StoreBW);
> +      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
> +        StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
> +      } else {
> +        llvm_unreachable("Invalid constant element type");
> +      }
> +    }
> +    
> +    // Create the new Load and Store operations.
> +    EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
> +    StoredVal = DAG.getConstant(StoreInt, StoreTy);
> +  }
> +  
> +  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
> +                                  FirstInChain->getBasePtr(),
> +                                  FirstInChain->getPointerInfo(),
> +                                  false, false,
> +                                  FirstInChain->getAlignment());
> +  
> +  // Replace the first store with the new store
> +  CombineTo(EarliestOp, NewStore);
> +  // Erase all other stores.
> +  for (unsigned i = 0; i < NumElem ; ++i) {
> +    if (StoreNodes[i].MemNode == EarliestOp)
> +      continue;
> +    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> +    // ReplaceAllUsesWith will replace all uses that existed when it was
> +    // called, but graph optimizations may cause new ones to appear. For
> +    // example, the case in pr14333 looks like
> +    //
> +    //  St's chain -> St -> another store -> X
> +    //
> +    // And the only difference from St to the other store is the chain.
> +    // When we change it's chain to be St's chain they become identical,
> +    // get CSEed and the net result is that X is now a use of St.
> +    // Since we know that St is redundant, just iterate.
> +    while (!St->use_empty())
> +      DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
> +    deleteAndRecombine(St);
> +  }
> +  
> +  return true;
> +}
> 
> bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
>   EVT MemVT = St->getMemoryVT();
> @@ -9731,11 +9850,14 @@ bool DAGCombiner::MergeConsecutiveStores
>     return false;
> 
>   // Perform an early exit check. Do not bother looking at stored values that
> -  // are not constants or loads.
> +  // are not constants, loads, or extracted vector elements.
>   SDValue StoredVal = St->getValue();
>   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
> -  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
> -      !IsLoadSrc)
> +  bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
> +                       isa<ConstantFPSDNode>(StoredVal);
> +  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
> +   
> +  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
>     return false;
> 
>   // Only look at ends of store sequences.
> @@ -9877,7 +9999,7 @@ bool DAGCombiner::MergeConsecutiveStores
>   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
> 
>   // Store the constants into memory as one consecutive store.
> -  if (!IsLoadSrc) {
> +  if (IsConstantSrc) {
>     unsigned LastLegalType = 0;
>     unsigned LastLegalVectorType = 0;
>     bool NonZero = false;
> @@ -9926,85 +10048,33 @@ bool DAGCombiner::MergeConsecutiveStores
>     bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
>     unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
> 
> -    // Make sure we have something to merge.
> -    if (NumElem < 2)
> -      return false;
> -
> -    unsigned EarliestNodeUsed = 0;
> -    for (unsigned i=0; i < NumElem; ++i) {
> -      // Find a chain for the new wide-store operand. Notice that some
> -      // of the store nodes that we found may not be selected for inclusion
> -      // in the wide store. The chain we use needs to be the chain of the
> -      // earliest store node which is *used* and replaced by the wide store.
> -      if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
> -        EarliestNodeUsed = i;
> -    }
> -
> -    // The earliest Node in the DAG.
> -    LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
> -    SDLoc DL(StoreNodes[0].MemNode);
> +    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
> +                                           true, UseVector);
> +  }
> 
> -    SDValue StoredVal;
> -    if (UseVector) {
> +  // When extracting multiple vector elements, try to store them
> +  // in one vector store rather than a sequence of scalar stores.
> +  if (IsExtractVecEltSrc) {
> +    unsigned NumElem = 0;
> +    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
> +      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
> +      SDValue StoredVal = St->getValue();
> +      // This restriction could be loosened.
> +      // Bail out if any stored values are not elements extracted from a vector.
> +      // It should be possible to handle mixed sources, but load sources need
> +      // more careful handling (see the block of code below that handles
> +      // consecutive loads).
> +      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
> +        return false;
> +      
>       // Find a legal type for the vector store.
> -      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
> -      assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
> -      StoredVal = DAG.getConstant(0, Ty);
> -    } else {
> -      unsigned StoreBW = NumElem * ElementSizeBytes * 8;
> -      APInt StoreInt(StoreBW, 0);
> -
> -      // Construct a single integer constant which is made of the smaller
> -      // constant inputs.
> -      bool IsLE = TLI.isLittleEndian();
> -      for (unsigned i = 0; i < NumElem ; ++i) {
> -        unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
> -        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
> -        SDValue Val = St->getValue();
> -        StoreInt<<=ElementSizeBytes*8;
> -        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
> -          StoreInt|=C->getAPIntValue().zext(StoreBW);
> -        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
> -          StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
> -        } else {
> -          llvm_unreachable("Invalid constant element type");
> -        }
> -      }
> -
> -      // Create the new Load and Store operations.
> -      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
> -      StoredVal = DAG.getConstant(StoreInt, StoreTy);
> -    }
> -
> -    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
> -                                    FirstInChain->getBasePtr(),
> -                                    FirstInChain->getPointerInfo(),
> -                                    false, false,
> -                                    FirstInChain->getAlignment());
> -
> -    // Replace the first store with the new store
> -    CombineTo(EarliestOp, NewStore);
> -    // Erase all other stores.
> -    for (unsigned i = 0; i < NumElem ; ++i) {
> -      if (StoreNodes[i].MemNode == EarliestOp)
> -        continue;
> -      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
> -      // ReplaceAllUsesWith will replace all uses that existed when it was
> -      // called, but graph optimizations may cause new ones to appear. For
> -      // example, the case in pr14333 looks like
> -      //
> -      //  St's chain -> St -> another store -> X
> -      //
> -      // And the only difference from St to the other store is the chain.
> -      // When we change it's chain to be St's chain they become identical,
> -      // get CSEed and the net result is that X is now a use of St.
> -      // Since we know that St is redundant, just iterate.
> -      while (!St->use_empty())
> -        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
> -      deleteAndRecombine(St);
> +      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
> +      if (TLI.isTypeLegal(Ty))
> +        NumElem = i + 1;
>     }
> 
> -    return true;
> +    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
> +                                           false, true);
>   }
> 
>   // Below we handle the case of multiple consecutive stores that
> 
> Modified: llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll?rev=226845&r1=226844&r2=226845&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll Thu Jan 22 12:21:26 2015
> @@ -434,3 +434,62 @@ define void @loadStoreBaseIndexOffsetSex
> ; <label>:14
>   ret void
> }
> +
> +; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
> +define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
> +  %vecext0 = extractelement <8 x float> %v, i32 0
> +  %vecext1 = extractelement <8 x float> %v, i32 1
> +  %vecext2 = extractelement <8 x float> %v, i32 2
> +  %vecext3 = extractelement <8 x float> %v, i32 3
> +  %vecext4 = extractelement <8 x float> %v, i32 4
> +  %vecext5 = extractelement <8 x float> %v, i32 5
> +  %vecext6 = extractelement <8 x float> %v, i32 6
> +  %vecext7 = extractelement <8 x float> %v, i32 7
> +  %arrayidx1 = getelementptr inbounds float* %ptr, i64 1
> +  %arrayidx2 = getelementptr inbounds float* %ptr, i64 2
> +  %arrayidx3 = getelementptr inbounds float* %ptr, i64 3
> +  %arrayidx4 = getelementptr inbounds float* %ptr, i64 4
> +  %arrayidx5 = getelementptr inbounds float* %ptr, i64 5
> +  %arrayidx6 = getelementptr inbounds float* %ptr, i64 6
> +  %arrayidx7 = getelementptr inbounds float* %ptr, i64 7
> +  store float %vecext0, float* %ptr, align 4
> +  store float %vecext1, float* %arrayidx1, align 4
> +  store float %vecext2, float* %arrayidx2, align 4
> +  store float %vecext3, float* %arrayidx3, align 4
> +  store float %vecext4, float* %arrayidx4, align 4
> +  store float %vecext5, float* %arrayidx5, align 4
> +  store float %vecext6, float* %arrayidx6, align 4
> +  store float %vecext7, float* %arrayidx7, align 4
> +  ret void
> +
> +; CHECK-LABEL: merge_vec_element_store
> +; CHECK: vmovups
> +; CHECK-NEXT: vzeroupper
> +; CHECK-NEXT: retq
> +}
> +
> +; This is a minimized test based on real code that was failing.
> +; We could merge stores (and loads) like this...
> +
> +define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
> +  %idx0 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 0
> +  %idx1 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 1
> +  %idx4 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 4
> +  %idx5 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 5
> +
> +  %a0 = load i64* %idx0, align 8
> +  store i64 %a0, i64* %idx4, align 8
> +
> +  %b = bitcast i64* %idx1 to <2 x i64>*
> +  %v = load <2 x i64>* %b, align 8
> +  %a1 = extractelement <2 x i64> %v, i32 0
> +  store i64 %a1, i64* %idx5, align 8
> +  ret void
> +
> +; CHECK-LABEL: merge_vec_element_and_scalar_load
> +; CHECK:      movq	(%rdi), %rax
> +; CHECK-NEXT: movq	%rax, 32(%rdi)
> +; CHECK-NEXT: movq	8(%rdi), %rax
> +; CHECK-NEXT: movq	%rax, 40(%rdi)
> +; CHECK-NEXT: retq
> +}
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits