[llvm-commits] [llvm] r170439 - in /llvm/trunk: lib/Transforms/Vectorize/LoopVectorize.cpp test/Transforms/LoopVectorize/reduction.ll

Tue Dec 18 11:13:08 PST 2012

On Dec 18, 2012, at 10:40 AM, Benjamin Kramer <benny.kra at googlemail.com> wrote:

> Author: d0k
> Date: Tue Dec 18 12:40:20 2012
> New Revision: 170439
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=170439&view=rev
> Log:
> LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.

Cool. Does this allow for target-specific hooks so that, for example, X86 can use haddps?

-Chris

> 
> For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
> 	movdqa	%xmm0, %xmm1
> 	movhlps	%xmm1, %xmm1            ## xmm1 = xmm1[1,1]
> 	paddw	%xmm0, %xmm1
> 	pshufd	$1, %xmm1, %xmm0        ## xmm0 = xmm1[1,0,0,0]
> 	paddw	%xmm1, %xmm0
> 	phaddw	%xmm0, %xmm0
> 	pextrb	$0, %xmm0, %edx
> 
> instead of
> 	pextrb	$2, %xmm0, %esi
> 	pextrb	$0, %xmm0, %edx
> 	addb	%sil, %dl
> 	pextrb	$4, %xmm0, %esi
> 	addb	%dl, %sil
> 	pextrb	$6, %xmm0, %edx
> 	addb	%sil, %dl
> 	pextrb	$8, %xmm0, %esi
> 	addb	%dl, %sil
> 	pextrb	$10, %xmm0, %edi
> 	pextrb	$14, %xmm0, %edx
> 	addb	%sil, %dil
> 	pextrb	$12, %xmm0, %esi
> 	addb	%dil, %sil
> 	addb	%sil, %dl
> 
> Modified:
>    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>    llvm/trunk/test/Transforms/LoopVectorize/reduction.ll
> 
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=170439&r1=170438&r2=170439&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Dec 18 12:40:20 2012
> @@ -817,34 +817,53 @@
>     NewPhi->addIncoming(VectorStart, LoopBypassBlock);
>     NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
> 
> -    // Extract the first scalar.
> -    Value *Scalar0 =
> -    Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
> -    // Extract and reduce the remaining vector elements.
> -    for (unsigned i=1; i < VF; ++i) {
> -      Value *Scalar1 =
> -      Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
> +    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
> +    // and vector ops, reducing the set of values being computed by half each
> +    // round.
> +    assert(isPowerOf2_32(VF) &&
> +           "Reduction emission only supported for pow2 vectors!");
> +    Value *TmpVec = NewPhi;
> +    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
> +    for (unsigned i = VF; i != 1; i >>= 1) {
> +      // Move the upper half of the vector to the lower half.
> +      for (unsigned j = 0; j != i/2; ++j)
> +        ShuffleMask[j] = Builder.getInt32(i/2 + j);
> +
> +      // Fill the rest of the mask with undef.
> +      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
> +                UndefValue::get(Builder.getInt32Ty()));
> +
> +      Value *Shuf =
> +        Builder.CreateShuffleVector(TmpVec,
> +                                    UndefValue::get(TmpVec->getType()),
> +                                    ConstantVector::get(ShuffleMask),
> +                                    "rdx.shuf");
> +
> +      // Emit the operation on the shuffled value.
>       switch (RdxDesc.Kind) {
>       case LoopVectorizationLegality::IntegerAdd:
> -        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
> +        TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx");
>         break;
>       case LoopVectorizationLegality::IntegerMult:
> -        Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
> +        TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx");
>         break;
>       case LoopVectorizationLegality::IntegerOr:
> -        Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
> +        TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx");
>         break;
>       case LoopVectorizationLegality::IntegerAnd:
> -        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
> +        TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx");
>         break;
>       case LoopVectorizationLegality::IntegerXor:
> -        Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
> +        TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx");
>         break;
>       default:
>         llvm_unreachable("Unknown reduction operation");
>       }
>     }
> 
> +    // The result is in the first element of the vector.
> +    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
> +
>     // Now, we need to fix the users of the reduction variable
>     // inside and outside of the scalar remainder loop.
>     // We know that the loop is in LCSSA form. We need to update the
> 
> Modified: llvm/trunk/test/Transforms/LoopVectorize/reduction.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/reduction.ll?rev=170439&r1=170438&r2=170439&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/reduction.ll (original)
> +++ llvm/trunk/test/Transforms/LoopVectorize/reduction.ll Tue Dec 18 12:40:20 2012
> @@ -7,6 +7,11 @@
> ;CHECK: phi <4 x i32>
> ;CHECK: load <4 x i32>
> ;CHECK: add <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
>   %1 = icmp sgt i32 %n, 0
> @@ -37,6 +42,11 @@
> ;CHECK: phi <4 x i32>
> ;CHECK: load <4 x i32>
> ;CHECK: mul <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: mul <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: mul <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
>   %1 = icmp sgt i32 %n, 0
> @@ -67,6 +77,11 @@
> ;CHECK: phi <4 x i32>
> ;CHECK: load <4 x i32>
> ;CHECK: mul nsw <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
>   %1 = icmp sgt i32 %n, 0
> @@ -95,6 +110,11 @@
> 
> ;CHECK: @reduction_mul
> ;CHECK: mul <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: mul <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: mul <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
>   %1 = icmp sgt i32 %n, 0
> @@ -124,6 +144,11 @@
> ;CHECK: @start_at_non_zero
> ;CHECK: phi <4 x i32>
> ;CHECK: <i32 120, i32 0, i32 0, i32 0>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: add <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
> entry:
> @@ -152,6 +177,11 @@
> ;CHECK: @reduction_and
> ;CHECK: and <4 x i32>
> ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: and <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: and <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
> entry:
> @@ -179,6 +209,11 @@
> 
> ;CHECK: @reduction_or
> ;CHECK: or <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: or <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: or <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
> entry:
> @@ -206,6 +241,11 @@
> 
> ;CHECK: @reduction_xor
> ;CHECK: xor <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +;CHECK: xor <4 x i32>
> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +;CHECK: xor <4 x i32>
> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
> ;CHECK: ret i32
> define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
> entry:
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits