<div dir="ltr">Thanks, Tom. Taking a look now.<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Jan 4, 2016 at 1:25 PM, Tom Stellard <span dir="ltr"><<a href="mailto:tom@stellard.net" target="_blank">tom@stellard.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Hi Sanjay,<br>

<br>

This commit caused a regression: <a href="https://llvm.org/bugs/show_bug.cgi?id=26015" rel="noreferrer" target="_blank">https://llvm.org/bugs/show_bug.cgi?id=26015</a><br>

<span class="HOEnZb"><font color="#888888"><br>

-Tom<br>

</font></span><div class="HOEnZb"><div class="h5"><br>

On Thu, Dec 24, 2015 at 09:17:56PM -0000, Sanjay Patel via llvm-commits wrote:<br>

> Author: spatel<br>

> Date: Thu Dec 24 15:17:56 2015<br>

> New Revision: 256394<br>

><br>

> URL: <a href="http://llvm.org/viewvc/llvm-project?rev=256394&view=rev" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project?rev=256394&view=rev</a><br>

> Log:<br>

> [InstCombine] transform more extract/insert pairs into shuffles (PR2109)<br>

><br>

> This is an extension of the shuffle combining from r203229:<br>

> <a href="http://reviews.llvm.org/rL203229" rel="noreferrer" target="_blank">http://reviews.llvm.org/rL203229</a><br>

><br>

> The idea is to widen a short input vector with undef elements so the<br>

> existing shuffle transform for extract/insert can kick in.<br>

><br>

> The motivation is to finally solve PR2109:<br>

> <a href="https://llvm.org/bugs/show_bug.cgi?id=2109" rel="noreferrer" target="_blank">https://llvm.org/bugs/show_bug.cgi?id=2109</a><br>

><br>

> For that example, the IR becomes:<br>

><br>

> %1 = bitcast <2 x i32>* %P to <2 x float>*<br>

> %ld1 = load <2 x float>, <2 x float>* %1, align 8<br>

> %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef><br>

> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5><br>

> ret <4 x float> %i2<br>

><br>

> And x86 SSE output improves from:<br>

><br>

> movq  (%rdi), %xmm1           ## xmm1 = mem[0],zero<br>

> movdqa        %xmm1, %xmm2<br>

> shufps        $229, %xmm2, %xmm2      ## xmm2 = xmm2[1,1,2,3]<br>

> shufps        $48, %xmm0, %xmm1       ## xmm1 = xmm1[0,0],xmm0[3,0]<br>

> shufps        $132, %xmm1, %xmm0      ## xmm0 = xmm0[0,1],xmm1[0,2]<br>

> shufps        $32, %xmm0, %xmm2       ## xmm2 = xmm2[0,0],xmm0[2,0]<br>

> shufps        $36, %xmm2, %xmm0       ## xmm0 = xmm0[0,1],xmm2[2,0]<br>

> retq<br>

><br>

> To the almost optimal:<br>

><br>

> movhpd        (%rdi), %xmm0<br>

><br>

> Note: There's a tension in the existing transform related to generating<br>

> arbitrary shufflevector masks. We avoid that in other places in InstCombine<br>

> because we're scared that codegen can't handle strange masks, but it looks<br>

> like we're ok with producing those here. I purposely chose weird insert/extract<br>

> indexes for the regression tests to see the effect in these cases.<br>

> For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or<br>

> better for these examples.<br>

><br>

> Differential Revision: <a href="http://reviews.llvm.org/D15096" rel="noreferrer" target="_blank">http://reviews.llvm.org/D15096</a><br>

><br>

><br>

> Modified:<br>

>     llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp<br>

>     llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll<br>

><br>

> Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp<br>

> URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp?rev=256394&r1=256393&r2=256394&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp?rev=256394&r1=256393&r2=256394&view=diff</a><br>

> ==============================================================================<br>

> --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (original)<br>

> +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp Thu Dec 24 15:17:56 2015<br>

> @@ -352,6 +352,48 @@ static bool collectSingleShuffleElements<br>

>    return false;<br>

>  }<br>

><br>

> +/// If we have insertion into a vector that is wider than the vector that we<br>

> +/// are extracting from, try to widen the source vector to allow a single<br>

> +/// shufflevector to replace one or more insert/extract pairs.<br>

> +static void replaceExtractElements(InsertElementInst *InsElt,<br>

> +                                   ExtractElementInst *ExtElt,<br>

> +                                   InstCombiner &IC) {<br>

> +  VectorType *InsVecType = InsElt->getType();<br>

> +  VectorType *ExtVecType = ExtElt->getVectorOperandType();<br>

> +  unsigned NumInsElts = InsVecType->getVectorNumElements();<br>

> +  unsigned NumExtElts = ExtVecType->getVectorNumElements();<br>

> +<br>

> +  // The inserted-to vector must be wider than the extracted-from vector.<br>

> +  if (InsVecType->getElementType() != ExtVecType->getElementType() ||<br>

> +      NumExtElts >= NumInsElts)<br>

> +    return;<br>

> +<br>

> +  // Create a shuffle mask to widen the extended-from vector using undefined<br>

> +  // values. The mask selects all of the values of the original vector followed<br>

> +  // by as many undefined values as needed to create a vector of the same length<br>

> +  // as the inserted-to vector.<br>

> +  SmallVector<Constant *, 16> ExtendMask;<br>

> +  IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());<br>

> +  for (unsigned i = 0; i < NumExtElts; ++i)<br>

> +    ExtendMask.push_back(ConstantInt::get(IntType, i));<br>

> +  for (unsigned i = NumExtElts; i < NumInsElts; ++i)<br>

> +    ExtendMask.push_back(UndefValue::get(IntType));<br>

> +<br>

> +  Value *ExtVecOp = ExtElt->getVectorOperand();<br>

> +  auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),<br>

> +                                        ConstantVector::get(ExtendMask));<br>

> +<br>

> +  // Replace all extracts from the original narrow vector with extracts from<br>

> +  // the new wide vector.<br>

> +  WideVec->insertBefore(ExtElt);<br>

> +  for (User *U : ExtVecOp->users()) {<br>

> +    if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) {<br>

> +      auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));<br>

> +      NewExt->insertAfter(WideVec);<br>

> +      IC.ReplaceInstUsesWith(*OldExt, NewExt);<br>

> +    }<br>

> +  }<br>

> +}<br>

><br>

>  /// We are building a shuffle to create V, which is a sequence of insertelement,<br>

>  /// extractelement pairs. If PermittedRHS is set, then we must either use it or<br>

> @@ -365,7 +407,8 @@ typedef std::pair<Value *, Value *> Shuf<br>

><br>

>  static ShuffleOps collectShuffleElements(Value *V,<br>

>                                           SmallVectorImpl<Constant *> &Mask,<br>

> -                                         Value *PermittedRHS) {<br>

> +                                         Value *PermittedRHS,<br>

> +                                         InstCombiner &IC) {<br>

>    assert(V->getType()->isVectorTy() && "Invalid shuffle!");<br>

>    unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();<br>

><br>

> @@ -396,10 +439,14 @@ static ShuffleOps collectShuffleElements<br>

>          // otherwise we'd end up with a shuffle of three inputs.<br>

>          if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {<br>

>            Value *RHS = EI->getOperand(0);<br>

> -          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS);<br>

> +          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);<br>

>            assert(LR.second == nullptr || LR.second == RHS);<br>

><br>

>            if (LR.first->getType() != RHS->getType()) {<br>

> +            // Although we are giving up for now, see if we can create extracts<br>

> +            // that match the inserts for another round of combining.<br>

> +            replaceExtractElements(IEI, EI, IC);<br>

> +<br>

>              // We tried our best, but we can't find anything compatible with RHS<br>

>              // further up the chain. Return a trivial shuffle.<br>

>              for (unsigned i = 0; i < NumElts; ++i)<br>

> @@ -512,7 +559,7 @@ Instruction *InstCombiner::visitInsertEl<br>

>        // (and any insertelements it points to), into one big shuffle.<br>

>        if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {<br>

>          SmallVector<Constant*, 16> Mask;<br>

> -        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr);<br>

> +        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);<br>

><br>

>          // The proposed shuffle may be trivial, in which case we shouldn't<br>

>          // perform the combine.<br>

><br>

> Modified: llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll<br>

> URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll?rev=256394&r1=256393&r2=256394&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll?rev=256394&r1=256393&r2=256394&view=diff</a><br>

> ==============================================================================<br>

> --- llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll (original)<br>

> +++ llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll Thu Dec 24 15:17:56 2015<br>

> @@ -26,8 +26,8 @@ define <4 x i16> @test2(<8 x i16> %in, <<br>

><br>

>  define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {<br>

>  ; CHECK-LABEL: @test_vcopyq_lane_p64<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: insertelement<br>

> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef><br>

> +; CHECK-NEXT: shufflevector <2 x i64> %a, <2 x i64> %[[WIDEVEC]], <2 x i32> <i32 0, i32 2><br>

>  ; CHECK-NEXT: ret <2 x i64> %res<br>

>    %elt = extractelement <1 x i64> %b, i32 0<br>

>    %res = insertelement <2 x i64> %a, i64 %elt, i32 1<br>

> @@ -38,10 +38,8 @@ define <2 x i64> @test_vcopyq_lane_p64(<<br>

><br>

>  define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {<br>

>  ; CHECK-LABEL: @widen_extract2(<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: insertelement<br>

> -; CHECK-NEXT: insertelement<br>

> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef><br>

> +; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 0, i32 4, i32 2, i32 5><br>

>  ; CHECK-NEXT: ret <4 x float> %i2<br>

>    %e1 = extractelement <2 x float> %ext, i32 0<br>

>    %e2 = extractelement <2 x float> %ext, i32 1<br>

> @@ -52,12 +50,8 @@ define <4 x float> @widen_extract2(<4 x<br>

><br>

>  define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {<br>

>  ; CHECK-LABEL: @widen_extract3(<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: insertelement<br>

> -; CHECK-NEXT: insertelement<br>

> -; CHECK-NEXT: insertelement<br>

> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef><br>

> +; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 6, i32 5, i32 4, i32 3><br>

>  ; CHECK-NEXT: ret <4 x float> %i3<br>

>    %e1 = extractelement <3 x float> %ext, i32 0<br>

>    %e2 = extractelement <3 x float> %ext, i32 1<br>

> @@ -68,10 +62,10 @@ define <4 x float> @widen_extract3(<4 x<br>

>    ret <4 x float> %i3<br>

>  }<br>

><br>

> -define <8 x float> @too_wide(<8 x float> %ins, <2 x float> %ext) {<br>

> -; CHECK-LABEL: @too_wide(<br>

> -; CHECK-NEXT: extractelement<br>

> -; CHECK-NEXT: insertelement<br>

> +define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {<br>

> +; CHECK-LABEL: @widen_extract4(<br>

> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef><br>

> +; CHECK-NEXT: shufflevector <8 x float> %ins, <8 x float> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7><br>

>  ; CHECK-NEXT: ret <8 x float> %i1<br>

>    %e1 = extractelement <2 x float> %ext, i32 0<br>

>    %i1 = insertelement <8 x float> %ins, float %e1, i32 2<br>

><br>

><br>

> _______________________________________________<br>

> llvm-commits mailing list<br>

> <a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a><br>

> <a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>

</div></div></blockquote></div><br></div>