[llvm] r244872 - [InstCombine] SSE/AVX vector shifts demanded shift amount bits

Duncan P. N. Exon Smith via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 21 11:35:57 PDT 2015


> On 2015-Aug-13, at 00:39, Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org> wrote:
> 
> Author: rksimon
> Date: Thu Aug 13 02:39:03 2015
> New Revision: 244872
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=244872&view=rev
> Log:
> [InstCombine] SSE/AVX vector shifts demanded shift amount bits
> 
> Most SSE/AVX (non-constant) vector shift instructions only use the lower 64 bits of the 128-bit shift amount vector operand, so this patch calls SimplifyDemandedVectorElts to optimize for this.
> 
> I had to refactor some of my recent InstCombiner work on the vector shifts to avoid quite a bit of duplicate code; this means that SimplifyX86immshift now (re)decodes the type of shift.
> 
> Differential Revision: http://reviews.llvm.org/D11938
> 
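For anyone skimming the log: since the instruction only reads the low 64 bits of the count operand, a splat of the shift count can now be dropped entirely.  Roughly, per the new sse2_psrl_q_var test below:

   ; before
   %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)

   ; after
   %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
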
> Modified:
>    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
>    llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
> 
> Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=244872&r1=244871&r2=244872&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
> +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Thu Aug 13 02:39:03 2015
> @@ -198,8 +198,52 @@ Instruction *InstCombiner::SimplifyMemSe
> }
> 
> static Value *SimplifyX86immshift(const IntrinsicInst &II,
> -                                  InstCombiner::BuilderTy &Builder,
> -                                  bool LogicalShift, bool ShiftLeft) {
> +                                  InstCombiner::BuilderTy &Builder) {
> +  bool LogicalShift = false;
> +  bool ShiftLeft = false;
> +
> +  switch (II.getIntrinsicID()) {
> +  default:
> +    return nullptr;
> +  case Intrinsic::x86_sse2_psra_d:
> +  case Intrinsic::x86_sse2_psra_w:
> +  case Intrinsic::x86_sse2_psrai_d:
> +  case Intrinsic::x86_sse2_psrai_w:
> +  case Intrinsic::x86_avx2_psra_d:
> +  case Intrinsic::x86_avx2_psra_w:
> +  case Intrinsic::x86_avx2_psrai_d:
> +  case Intrinsic::x86_avx2_psrai_w:
> +    LogicalShift = false; ShiftLeft = false;
> +    break;
> +  case Intrinsic::x86_sse2_psrl_d:
> +  case Intrinsic::x86_sse2_psrl_q:
> +  case Intrinsic::x86_sse2_psrl_w:
> +  case Intrinsic::x86_sse2_psrli_d:
> +  case Intrinsic::x86_sse2_psrli_q:
> +  case Intrinsic::x86_sse2_psrli_w:
> +  case Intrinsic::x86_avx2_psrl_d:
> +  case Intrinsic::x86_avx2_psrl_q:
> +  case Intrinsic::x86_avx2_psrl_w:
> +  case Intrinsic::x86_avx2_psrli_d:
> +  case Intrinsic::x86_avx2_psrli_q:
> +  case Intrinsic::x86_avx2_psrli_w:
> +    LogicalShift = true; ShiftLeft = false;
> +    break;
> +  case Intrinsic::x86_sse2_psll_d:
> +  case Intrinsic::x86_sse2_psll_q:
> +  case Intrinsic::x86_sse2_psll_w:
> +  case Intrinsic::x86_sse2_pslli_d:
> +  case Intrinsic::x86_sse2_pslli_q:
> +  case Intrinsic::x86_sse2_pslli_w:
> +  case Intrinsic::x86_avx2_psll_d:
> +  case Intrinsic::x86_avx2_psll_q:
> +  case Intrinsic::x86_avx2_psll_w:
> +  case Intrinsic::x86_avx2_pslli_d:
> +  case Intrinsic::x86_avx2_pslli_q:
> +  case Intrinsic::x86_avx2_pslli_w:
> +    LogicalShift = true; ShiftLeft = true;
> +    break;
> +  }
>   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
> 
>   // Simplify if count is constant.
> @@ -788,51 +832,64 @@ Instruction *InstCombiner::visitCallInst
>   }
> 
>   // Constant fold ashr( <A x Bi>, Ci ).
> -  case Intrinsic::x86_sse2_psra_d:
> -  case Intrinsic::x86_sse2_psra_w:
> +  // Constant fold lshr( <A x Bi>, Ci ).
> +  // Constant fold shl( <A x Bi>, Ci ).
>   case Intrinsic::x86_sse2_psrai_d:
>   case Intrinsic::x86_sse2_psrai_w:
> -  case Intrinsic::x86_avx2_psra_d:
> -  case Intrinsic::x86_avx2_psra_w:
>   case Intrinsic::x86_avx2_psrai_d:
>   case Intrinsic::x86_avx2_psrai_w:
> -    if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
> -      return ReplaceInstUsesWith(*II, V);
> -    break;
> -
> -  // Constant fold lshr( <A x Bi>, Ci ).
> -  case Intrinsic::x86_sse2_psrl_d:
> -  case Intrinsic::x86_sse2_psrl_q:
> -  case Intrinsic::x86_sse2_psrl_w:
>   case Intrinsic::x86_sse2_psrli_d:
>   case Intrinsic::x86_sse2_psrli_q:
>   case Intrinsic::x86_sse2_psrli_w:
> -  case Intrinsic::x86_avx2_psrl_d:
> -  case Intrinsic::x86_avx2_psrl_q:
> -  case Intrinsic::x86_avx2_psrl_w:
>   case Intrinsic::x86_avx2_psrli_d:
>   case Intrinsic::x86_avx2_psrli_q:
>   case Intrinsic::x86_avx2_psrli_w:
> -    if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
> +  case Intrinsic::x86_sse2_pslli_d:
> +  case Intrinsic::x86_sse2_pslli_q:
> +  case Intrinsic::x86_sse2_pslli_w:
> +  case Intrinsic::x86_avx2_pslli_d:
> +  case Intrinsic::x86_avx2_pslli_q:
> +  case Intrinsic::x86_avx2_pslli_w:
> +    if (Value *V = SimplifyX86immshift(*II, *Builder))
>       return ReplaceInstUsesWith(*II, V);
>     break;
> 
> -  // Constant fold shl( <A x Bi>, Ci ).
> +  case Intrinsic::x86_sse2_psra_d:
> +  case Intrinsic::x86_sse2_psra_w:
> +  case Intrinsic::x86_avx2_psra_d:
> +  case Intrinsic::x86_avx2_psra_w:
> +  case Intrinsic::x86_sse2_psrl_d:
> +  case Intrinsic::x86_sse2_psrl_q:
> +  case Intrinsic::x86_sse2_psrl_w:
> +  case Intrinsic::x86_avx2_psrl_d:
> +  case Intrinsic::x86_avx2_psrl_q:
> +  case Intrinsic::x86_avx2_psrl_w:
>   case Intrinsic::x86_sse2_psll_d:
>   case Intrinsic::x86_sse2_psll_q:
>   case Intrinsic::x86_sse2_psll_w:
> -  case Intrinsic::x86_sse2_pslli_d:
> -  case Intrinsic::x86_sse2_pslli_q:
> -  case Intrinsic::x86_sse2_pslli_w:
>   case Intrinsic::x86_avx2_psll_d:
>   case Intrinsic::x86_avx2_psll_q:
> -  case Intrinsic::x86_avx2_psll_w:
> -  case Intrinsic::x86_avx2_pslli_d:
> -  case Intrinsic::x86_avx2_pslli_q:
> -  case Intrinsic::x86_avx2_pslli_w:
> -    if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
> +  case Intrinsic::x86_avx2_psll_w: {
> +    if (Value *V = SimplifyX86immshift(*II, *Builder))
>       return ReplaceInstUsesWith(*II, V);
> +
> +    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
> +    // operand to compute the shift amount.
> +    auto ShiftAmt = II->getArgOperand(1);
> +    auto ShiftType = cast<VectorType>(ShiftAmt->getType());
> +    assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
> +           "Unexpected packed shift size");
> +    unsigned VWidth = ShiftType->getNumElements();
> +
> +    APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
> +    APInt UndefElts(VWidth, 0);
> +    if (Value *V =
> +            SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
> +      II->setArgOperand(1, V);
> +      return II;
> +    }
>     break;
> +  }
> 
>   case Intrinsic::x86_sse41_pmovsxbd:
>   case Intrinsic::x86_sse41_pmovsxbq:
> 
> Modified: llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll?rev=244872&r1=244871&r2=244872&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll (original)
> +++ llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll Thu Aug 13 02:39:03 2015
> @@ -826,6 +826,154 @@ define <4 x i64> @avx2_psll_q_64(<4 x i6
> }
> 
> ;
> +; Vector Demanded Bits
> +;
> +
> +define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psra_w_var
> +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <8 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
> +  ret <8 x i16> %2
> +}

Not obvious via email, but these test changes all have Windows line
endings.  Would you please fix that?

> +
> +define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psra_d_var
> +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <4 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
> +  ret <4 x i32> %2
> +}
> +
> +define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psra_w_var
> +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <16 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
> +  ret <16 x i16> %2
> +}
> +
> +define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psra_d_var
> +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <8 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
> +  ret <8 x i32> %2
> +}
> +
> +define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psrl_w_var
> +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <8 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
> +  ret <8 x i16> %2
> +}
> +
> +define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psrl_d_var
> +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <4 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
> +  ret <4 x i32> %2
> +}
> +
> +define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psrl_q_var
> +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
> +; CHECK-NEXT: ret <2 x i64> %1
> +  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
> +  %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
> +  ret <2 x i64> %2
> +}
> +
> +define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psrl_w_var
> +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <16 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
> +  ret <16 x i16> %2
> +}
> +
> +define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psrl_d_var
> +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <8 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
> +  ret <8 x i32> %2
> +}
> +
> +define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psrl_q_var
> +; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
> +; CHECK-NEXT: ret <4 x i64> %1
> +  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
> +  %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
> +  ret <4 x i64> %2
> +}
> +
> +define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psll_w_var
> +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <8 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
> +  ret <8 x i16> %2
> +}
> +
> +define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psll_d_var
> +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <4 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
> +  ret <4 x i32> %2
> +}
> +
> +define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @sse2_psll_q_var
> +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
> +; CHECK-NEXT: ret <2 x i64> %1
> +  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
> +  %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
> +  ret <2 x i64> %2
> +}
> +
> +define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psll_w_var
> +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
> +; CHECK-NEXT: ret <16 x i16> %1
> +  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
> +  %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
> +  ret <16 x i16> %2
> +}
> +
> +define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psll_d_var
> +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
> +; CHECK-NEXT: ret <8 x i32> %1
> +  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +  %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
> +  ret <8 x i32> %2
> +}
> +
> +define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
> +; CHECK-LABEL: @avx2_psll_q_var
> +; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
> +; CHECK-NEXT: ret <4 x i64> %1
> +  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
> +  %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
> +  ret <4 x i64> %2
> +}
> +
> +;
> ; Constant Folding
> ;
> 
> 
> 