[llvm] r209788 - [x86] Fold extract_vector_elt of a load into the Load's address computation.
Michael Spencer
bigcheesegs at gmail.com
Fri Jul 11 15:07:50 PDT 2014
On Fri, Jul 11, 2014 at 12:47 PM, Michael Spencer <bigcheesegs at gmail.com> wrote:
> On Fri, Jul 11, 2014 at 5:36 AM, Dmitry Babokin <babokin at gmail.com> wrote:
>> Michael,
>>
>> Your commit caused a regression described in bug 20115, could you have a
>> look?
>>
>> Dmitry
>
> Investigating now. Thanks for letting me know.
>
> - Michael Spencer
It seems that it creates a dependency loop by moving the use of
%POINT_VALUE from the extractelement up to the original load across
the lifetime end marker. In this specific case that would be fine if
the chains were fixed, but I don't believe it's always valid. I think
the conservative fix here is to not do this optimization if you would
have to move a load or an operation with possible side effects.
- Michael Spencer
>
>>
>>
>> On Thu, May 29, 2014 at 5:42 AM, Michael J. Spencer <bigcheesegs at gmail.com>
>> wrote:
>>>
>>> Author: mspencer
>>> Date: Wed May 28 20:42:45 2014
>>> New Revision: 209788
>>>
>>> URL: http://llvm.org/viewvc/llvm-project?rev=209788&view=rev
>>> Log:
>>> [x86] Fold extract_vector_elt of a load into the Load's address
>>> computation.
>>>
>>> An address only use of an extract element of a load can be simplified to a
>>> load. Without this the result of the extract element is spilled to the
>>> stack so that an address is available.
>>>
>>> Modified:
>>> llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>>> llvm/trunk/test/CodeGen/X86/vec_splat.ll
>>>
>>> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>>> URL:
>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=209788&r1=209787&r2=209788&view=diff
>>>
>>> ==============================================================================
>>> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
>>> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Wed May 28
>>> 20:42:45 2014
>>> @@ -169,6 +169,16 @@ namespace {
>>> bool CombineToPostIndexedLoadStore(SDNode *N);
>>> bool SliceUpLoad(SDNode *N);
>>>
>>> + /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a
>>> narrowed
>>> + /// load.
>>> + ///
>>> + /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
>>> + /// \param InVecVT type of the input vector to EVE with bitcasts
>>> resolved.
>>> + /// \param EltNo index of the vector element to load.
>>> + /// \param OriginalLoad load that EVE came from to be replaced.
>>> + /// \returns EVE on success SDValue() on failure.
>>> + SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
>>> + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode
>>> *OriginalLoad);
>>> void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
>>> SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
>>> SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
>>> @@ -9675,6 +9685,86 @@ SDValue DAGCombiner::visitINSERT_VECTOR_
>>> return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
>>> }
>>>
>>> +SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
>>> + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
>>> + EVT ResultVT = EVE->getValueType(0);
>>> + EVT VecEltVT = InVecVT.getVectorElementType();
>>> + unsigned Align = OriginalLoad->getAlignment();
>>> + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
>>> + VecEltVT.getTypeForEVT(*DAG.getContext()));
>>> +
>>> + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD,
>>> VecEltVT))
>>> + return SDValue();
>>> +
>>> + Align = NewAlign;
>>> +
>>> + SDValue NewPtr = OriginalLoad->getBasePtr();
>>> + SDValue Offset;
>>> + EVT PtrType = NewPtr.getValueType();
>>> + MachinePointerInfo MPI;
>>> + if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
>>> + int Elt = ConstEltNo->getZExtValue();
>>> + unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
>>> + if (TLI.isBigEndian())
>>> + PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff;
>>> + Offset = DAG.getConstant(PtrOff, PtrType);
>>> + MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
>>> + } else {
>>> + Offset = DAG.getNode(
>>> + ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo,
>>> + DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType()));
>>> + if (TLI.isBigEndian())
>>> + Offset = DAG.getNode(
>>> + ISD::SUB, SDLoc(EVE), EltNo.getValueType(),
>>> + DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()),
>>> Offset);
>>> + MPI = OriginalLoad->getPointerInfo();
>>> + }
>>> + NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset);
>>> +
>>> + // The replacement we need to do here is a little tricky: we need to
>>> + // replace an extractelement of a load with a load.
>>> + // Use ReplaceAllUsesOfValuesWith to do the replacement.
>>> + // Note that this replacement assumes that the extractvalue is the only
>>> + // use of the load; that's okay because we don't want to perform this
>>> + // transformation in other cases anyway.
>>> + SDValue Load;
>>> + SDValue Chain;
>>> + if (ResultVT.bitsGT(VecEltVT)) {
>>> + // If the result type of vextract is wider than the load, then issue
>>> an
>>> + // extending load instead.
>>> + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
>>> VecEltVT)
>>> + ? ISD::ZEXTLOAD
>>> + : ISD::EXTLOAD;
>>> + Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
>>> OriginalLoad->getChain(),
>>> + NewPtr, MPI, VecEltVT,
>>> OriginalLoad->isVolatile(),
>>> + OriginalLoad->isNonTemporal(), Align,
>>> + OriginalLoad->getTBAAInfo());
>>> + Chain = Load.getValue(1);
>>> + } else {
>>> + Load = DAG.getLoad(
>>> + VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI,
>>> + OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
>>> + OriginalLoad->isInvariant(), Align, OriginalLoad->getTBAAInfo());
>>> + Chain = Load.getValue(1);
>>> + if (ResultVT.bitsLT(VecEltVT))
>>> + Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
>>> + else
>>> + Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load);
>>> + }
>>> + WorkListRemover DeadNodes(*this);
>>> + SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
>>> + SDValue To[] = { Load, Chain };
>>> + DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
>>> + // Since we're explicitly calling ReplaceAllUses, add the new node to
>>> the
>>> + // worklist explicitly as well.
>>> + AddToWorkList(Load.getNode());
>>> + AddUsersToWorkList(Load.getNode()); // Add users too
>>> + // Make sure to revisit this node to clean it up; it will usually be
>>> dead.
>>> + AddToWorkList(EVE);
>>> + ++OpsNarrowed;
>>> + return SDValue(EVE, 0);
>>> +}
>>> +
>>> SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
>>> // (vextract (scalar_to_vector val, 0) -> val
>>> SDValue InVec = N->getOperand(0);
>>> @@ -9743,6 +9833,38 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR
>>> }
>>> }
>>>
>>> + bool BCNumEltsChanged = false;
>>> + EVT ExtVT = VT.getVectorElementType();
>>> + EVT LVT = ExtVT;
>>> +
>>> + // If the result of load has to be truncated, then it's not necessarily
>>> + // profitable.
>>> + if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
>>> + return SDValue();
>>> +
>>> + if (InVec.getOpcode() == ISD::BITCAST) {
>>> + // Don't duplicate a load with other uses.
>>> + if (!InVec.hasOneUse())
>>> + return SDValue();
>>> +
>>> + EVT BCVT = InVec.getOperand(0).getValueType();
>>> + if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
>>> + return SDValue();
>>> + if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
>>> + BCNumEltsChanged = true;
>>> + InVec = InVec.getOperand(0);
>>> + ExtVT = BCVT.getVectorElementType();
>>> + }
>>> +
>>> + // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
>>> + if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
>>> + ISD::isNormalLoad(InVec.getNode())) {
>>> + SDValue Index = N->getOperand(1);
>>> + if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec))
>>> + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
>>> + OrigLoad);
>>> + }
>>> +
>>> // Perform only after legalization to ensure build_vector /
>>> vector_shuffle
>>> // optimizations have already been done.
>>> if (!LegalOperations) return SDValue();
>>> @@ -9753,30 +9875,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR
>>>
>>> if (ConstEltNo) {
>>> int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
>>> - bool NewLoad = false;
>>> - bool BCNumEltsChanged = false;
>>> - EVT ExtVT = VT.getVectorElementType();
>>> - EVT LVT = ExtVT;
>>> -
>>> - // If the result of load has to be truncated, then it's not
>>> necessarily
>>> - // profitable.
>>> - if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
>>> - return SDValue();
>>> -
>>> - if (InVec.getOpcode() == ISD::BITCAST) {
>>> - // Don't duplicate a load with other uses.
>>> - if (!InVec.hasOneUse())
>>> - return SDValue();
>>> -
>>> - EVT BCVT = InVec.getOperand(0).getValueType();
>>> - if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
>>> - return SDValue();
>>> - if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
>>> - BCNumEltsChanged = true;
>>> - InVec = InVec.getOperand(0);
>>> - ExtVT = BCVT.getVectorElementType();
>>> - NewLoad = true;
>>> - }
>>>
>>> LoadSDNode *LN0 = nullptr;
>>> const ShuffleVectorSDNode *SVN = nullptr;
>>> @@ -9819,6 +9917,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR
>>> if (ISD::isNormalLoad(InVec.getNode())) {
>>> LN0 = cast<LoadSDNode>(InVec);
>>> Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
>>> + EltNo = DAG.getConstant(Elt, EltNo.getValueType());
>>> }
>>> }
>>>
>>> @@ -9831,72 +9930,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR
>>> if (Elt == -1)
>>> return DAG.getUNDEF(LVT);
>>>
>>> - unsigned Align = LN0->getAlignment();
>>> - if (NewLoad) {
>>> - // Check the resultant load doesn't need a higher alignment than
>>> the
>>> - // original load.
>>> - unsigned NewAlign =
>>> - TLI.getDataLayout()
>>> - ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext()));
>>> -
>>> - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD,
>>> LVT))
>>> - return SDValue();
>>> -
>>> - Align = NewAlign;
>>> - }
>>> -
>>> - SDValue NewPtr = LN0->getBasePtr();
>>> - unsigned PtrOff = 0;
>>> -
>>> - if (Elt) {
>>> - PtrOff = LVT.getSizeInBits() * Elt / 8;
>>> - EVT PtrType = NewPtr.getValueType();
>>> - if (TLI.isBigEndian())
>>> - PtrOff = VT.getSizeInBits() / 8 - PtrOff;
>>> - NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr,
>>> - DAG.getConstant(PtrOff, PtrType));
>>> - }
>>> -
>>> - // The replacement we need to do here is a little tricky: we need to
>>> - // replace an extractelement of a load with a load.
>>> - // Use ReplaceAllUsesOfValuesWith to do the replacement.
>>> - // Note that this replacement assumes that the extractvalue is the
>>> only
>>> - // use of the load; that's okay because we don't want to perform this
>>> - // transformation in other cases anyway.
>>> - SDValue Load;
>>> - SDValue Chain;
>>> - if (NVT.bitsGT(LVT)) {
>>> - // If the result type of vextract is wider than the load, then
>>> issue an
>>> - // extending load instead.
>>> - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT)
>>> - ? ISD::ZEXTLOAD : ISD::EXTLOAD;
>>> - Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(),
>>> - NewPtr,
>>> LN0->getPointerInfo().getWithOffset(PtrOff),
>>> - LVT, LN0->isVolatile(), LN0->isNonTemporal(),
>>> - Align, LN0->getTBAAInfo());
>>> - Chain = Load.getValue(1);
>>> - } else {
>>> - Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr,
>>> - LN0->getPointerInfo().getWithOffset(PtrOff),
>>> - LN0->isVolatile(), LN0->isNonTemporal(),
>>> - LN0->isInvariant(), Align, LN0->getTBAAInfo());
>>> - Chain = Load.getValue(1);
>>> - if (NVT.bitsLT(LVT))
>>> - Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load);
>>> - else
>>> - Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load);
>>> - }
>>> - WorkListRemover DeadNodes(*this);
>>> - SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
>>> - SDValue To[] = { Load, Chain };
>>> - DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
>>> - // Since we're explcitly calling ReplaceAllUses, add the new node to
>>> the
>>> - // worklist explicitly as well.
>>> - AddToWorkList(Load.getNode());
>>> - AddUsersToWorkList(Load.getNode()); // Add users too
>>> - // Make sure to revisit this node to clean it up; it will usually be
>>> dead.
>>> - AddToWorkList(N);
>>> - return SDValue(N, 0);
>>> + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo,
>>> LN0);
>>> }
>>>
>>> return SDValue();
>>>
>>> Modified: llvm/trunk/test/CodeGen/X86/vec_splat.ll
>>> URL:
>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_splat.ll?rev=209788&r1=209787&r2=209788&view=diff
>>>
>>> ==============================================================================
>>> --- llvm/trunk/test/CodeGen/X86/vec_splat.ll (original)
>>> +++ llvm/trunk/test/CodeGen/X86/vec_splat.ll Wed May 28 20:42:45 2014
>>> @@ -1,5 +1,6 @@
>>> ; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s
>>> -check-prefix=SSE2
>>> ; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s
>>> -check-prefix=SSE3
>>> +; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
>>> -check-prefix=AVX
>>>
>>> define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X)
>>> nounwind {
>>> %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0
>>> ; <<4 x float>> [#uses=1]
>>> @@ -37,6 +38,23 @@ define void @test_v2sd(<2 x double>* %P,
>>> define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly
>>> %ptr, i64 %i, i64 %j) nounwind {
>>> %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
>>> %2 = load <4 x float>* %1, align 16
>>> + %3 = trunc i64 %j to i32
>>> + %4 = extractelement <4 x float> %2, i32 %3
>>> + %5 = insertelement <4 x float> undef, float %4, i32 0
>>> + %6 = insertelement <4 x float> %5, float %4, i32 1
>>> + %7 = insertelement <4 x float> %6, float %4, i32 2
>>> + %8 = insertelement <4 x float> %7, float %4, i32 3
>>> + ret <4 x float> %8
>>> +
>>> +; AVX-LABEL: load_extract_splat
>>> +; AVX-NOT: rsp
>>> +; AVX: vbroadcastss
>>> +}
>>> +
>>> +; Fold extract of a load into the load's address computation. This avoids
>>> spilling to the stack.
>>> +define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly
>>> %ptr, i64 %i, i64 %j) nounwind {
>>> + %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
>>> + %2 = load <4 x float>* %1, align 16
>>> %3 = extractelement <4 x float> %2, i64 %j
>>> %4 = insertelement <4 x float> undef, float %3, i32 0
>>> %5 = insertelement <4 x float> %4, float %3, i32 1
>>> @@ -44,7 +62,7 @@ define <4 x float> @load_extract_splat(<
>>> %7 = insertelement <4 x float> %6, float %3, i32 3
>>> ret <4 x float> %7
>>>
>>> -; AVX-LABEL: load_extract_splat
>>> +; AVX-LABEL: load_extract_splat1
>>> ; AVX-NOT: movs
>>> ; AVX: vbroadcastss
>>> }
>>>
>>>
>>> _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at cs.uiuc.edu
>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
>>
More information about the llvm-commits
mailing list