[llvm] r256394 - [InstCombine] transform more extract/insert pairs into shuffles (PR2109)

Mon Jan 4 12:25:44 PST 2016

Hi Sanjay,

This commit caused a regression: https://llvm.org/bugs/show_bug.cgi?id=26015

-Tom

On Thu, Dec 24, 2015 at 09:17:56PM -0000, Sanjay Patel via llvm-commits wrote:
> Author: spatel
> Date: Thu Dec 24 15:17:56 2015
> New Revision: 256394
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=256394&view=rev
> Log:
> [InstCombine] transform more extract/insert pairs into shuffles (PR2109)
> 
> This is an extension of the shuffle combining from r203229:
> http://reviews.llvm.org/rL203229
> 
> The idea is to widen a short input vector with undef elements so the
> existing shuffle transform for extract/insert can kick in.
> 
> The motivation is to finally solve PR2109:
> https://llvm.org/bugs/show_bug.cgi?id=2109
> 
> For that example, the IR becomes:
> 
> %1 = bitcast <2 x i32>* %P to <2 x float>*
> %ld1 = load <2 x float>, <2 x float>* %1, align 8
> %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> ret <4 x float> %i2
> 
> And x86 SSE output improves from:
> 
> movq	(%rdi), %xmm1           ## xmm1 = mem[0],zero
> movdqa	%xmm1, %xmm2
> shufps	$229, %xmm2, %xmm2      ## xmm2 = xmm2[1,1,2,3]
> shufps	$48, %xmm0, %xmm1       ## xmm1 = xmm1[0,0],xmm0[3,0]
> shufps	$132, %xmm1, %xmm0      ## xmm0 = xmm0[0,1],xmm1[0,2]
> shufps	$32, %xmm0, %xmm2       ## xmm2 = xmm2[0,0],xmm0[2,0]
> shufps	$36, %xmm2, %xmm0       ## xmm0 = xmm0[0,1],xmm2[2,0]
> retq
> 
> To the almost optimal:
> 
> movhpd	(%rdi), %xmm0
> 
> Note: There's a tension in the existing transform related to generating
> arbitrary shufflevector masks. We avoid that in other places in InstCombine
> because we're scared that codegen can't handle strange masks, but it looks
> like we're ok with producing those here. I purposely chose weird insert/extract
> indexes for the regression tests to see the effect in these cases. 
> For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or
> better for these examples.
> 
> Differential Revision: http://reviews.llvm.org/D15096
> 
> 
> Modified:
>     llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
>     llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll
> 
> Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp?rev=256394&r1=256393&r2=256394&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (original)
> +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp Thu Dec 24 15:17:56 2015
> @@ -352,6 +352,48 @@ static bool collectSingleShuffleElements
>    return false;
>  }
>  
> +/// If we have insertion into a vector that is wider than the vector that we
> +/// are extracting from, try to widen the source vector to allow a single
> +/// shufflevector to replace one or more insert/extract pairs.
> +static void replaceExtractElements(InsertElementInst *InsElt,
> +                                   ExtractElementInst *ExtElt,
> +                                   InstCombiner &IC) {
> +  VectorType *InsVecType = InsElt->getType();
> +  VectorType *ExtVecType = ExtElt->getVectorOperandType();
> +  unsigned NumInsElts = InsVecType->getVectorNumElements();
> +  unsigned NumExtElts = ExtVecType->getVectorNumElements();
> +
> +  // The inserted-to vector must be wider than the extracted-from vector.
> +  if (InsVecType->getElementType() != ExtVecType->getElementType() ||
> +      NumExtElts >= NumInsElts)
> +    return;
> +
> +  // Create a shuffle mask to widen the extended-from vector using undefined
> +  // values. The mask selects all of the values of the original vector followed
> +  // by as many undefined values as needed to create a vector of the same length
> +  // as the inserted-to vector.
> +  SmallVector<Constant *, 16> ExtendMask;
> +  IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());
> +  for (unsigned i = 0; i < NumExtElts; ++i)
> +    ExtendMask.push_back(ConstantInt::get(IntType, i));
> +  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
> +    ExtendMask.push_back(UndefValue::get(IntType));
> +
> +  Value *ExtVecOp = ExtElt->getVectorOperand();
> +  auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
> +                                        ConstantVector::get(ExtendMask));
> +
> +  // Replace all extracts from the original narrow vector with extracts from
> +  // the new wide vector.
> +  WideVec->insertBefore(ExtElt);
> +  for (User *U : ExtVecOp->users()) {
> +    if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) {
> +      auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
> +      NewExt->insertAfter(WideVec);
> +      IC.ReplaceInstUsesWith(*OldExt, NewExt);
> +    }
> +  }
> +}
>  
>  /// We are building a shuffle to create V, which is a sequence of insertelement,
>  /// extractelement pairs. If PermittedRHS is set, then we must either use it or
> @@ -365,7 +407,8 @@ typedef std::pair<Value *, Value *> Shuf
>  
>  static ShuffleOps collectShuffleElements(Value *V,
>                                           SmallVectorImpl<Constant *> &Mask,
> -                                         Value *PermittedRHS) {
> +                                         Value *PermittedRHS,
> +                                         InstCombiner &IC) {
>    assert(V->getType()->isVectorTy() && "Invalid shuffle!");
>    unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
>  
> @@ -396,10 +439,14 @@ static ShuffleOps collectShuffleElements
>          // otherwise we'd end up with a shuffle of three inputs.
>          if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
>            Value *RHS = EI->getOperand(0);
> -          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS);
> +          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
>            assert(LR.second == nullptr || LR.second == RHS);
>  
>            if (LR.first->getType() != RHS->getType()) {
> +            // Although we are giving up for now, see if we can create extracts
> +            // that match the inserts for another round of combining.
> +            replaceExtractElements(IEI, EI, IC);
> +
>              // We tried our best, but we can't find anything compatible with RHS
>              // further up the chain. Return a trivial shuffle.
>              for (unsigned i = 0; i < NumElts; ++i)
> @@ -512,7 +559,7 @@ Instruction *InstCombiner::visitInsertEl
>        // (and any insertelements it points to), into one big shuffle.
>        if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
>          SmallVector<Constant*, 16> Mask;
> -        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr);
> +        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
>  
>          // The proposed shuffle may be trivial, in which case we shouldn't
>          // perform the combine.
> 
> Modified: llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll?rev=256394&r1=256393&r2=256394&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll (original)
> +++ llvm/trunk/test/Transforms/InstCombine/insert-extract-shuffle.ll Thu Dec 24 15:17:56 2015
> @@ -26,8 +26,8 @@ define <4 x i16> @test2(<8 x i16> %in, <
>  
>  define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
>  ; CHECK-LABEL: @test_vcopyq_lane_p64
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: insertelement
> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
> +; CHECK-NEXT: shufflevector <2 x i64> %a, <2 x i64> %[[WIDEVEC]], <2 x i32> <i32 0, i32 2>
>  ; CHECK-NEXT: ret <2 x i64> %res
>    %elt = extractelement <1 x i64> %b, i32 0
>    %res = insertelement <2 x i64> %a, i64 %elt, i32 1
> @@ -38,10 +38,8 @@ define <2 x i64> @test_vcopyq_lane_p64(<
>  
>  define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
>  ; CHECK-LABEL: @widen_extract2(
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: insertelement
> -; CHECK-NEXT: insertelement
> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
>  ; CHECK-NEXT: ret <4 x float> %i2
>    %e1 = extractelement <2 x float> %ext, i32 0
>    %e2 = extractelement <2 x float> %ext, i32 1
> @@ -52,12 +50,8 @@ define <4 x float> @widen_extract2(<4 x
>  
>  define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
>  ; CHECK-LABEL: @widen_extract3(
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: insertelement
> -; CHECK-NEXT: insertelement
> -; CHECK-NEXT: insertelement
> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
> +; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
>  ; CHECK-NEXT: ret <4 x float> %i3
>    %e1 = extractelement <3 x float> %ext, i32 0
>    %e2 = extractelement <3 x float> %ext, i32 1
> @@ -68,10 +62,10 @@ define <4 x float> @widen_extract3(<4 x
>    ret <4 x float> %i3
>  }
>  
> -define <8 x float> @too_wide(<8 x float> %ins, <2 x float> %ext) {
> -; CHECK-LABEL: @too_wide(
> -; CHECK-NEXT: extractelement
> -; CHECK-NEXT: insertelement
> +define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
> +; CHECK-LABEL: @widen_extract4(
> +; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: shufflevector <8 x float> %ins, <8 x float> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
>  ; CHECK-NEXT: ret <8 x float> %i1
>    %e1 = extractelement <2 x float> %ext, i32 0
>    %i1 = insertelement <8 x float> %ins, float %e1, i32 2
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits