[llvm] r271410 - [LV] For some IVs, use vector phis instead of widening in the loop body
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 1 10:16:50 PDT 2016
Author: mkuper
Date: Wed Jun 1 12:16:46 2016
New Revision: 271410
URL: http://llvm.org/viewvc/llvm-project?rev=271410&view=rev
Log:
[LV] For some IVs, use vector phis instead of widening in the loop body
Previously, whenever we needed a vector IV, we would create it on the fly,
by splatting the scalar IV and adding a step vector. Instead, we can create a
real vector IV. This tends to save a couple of instructions per iteration.
This only changes the behavior for the most basic case - integer primary
IVs with a constant step.
Differential Revision: http://reviews.llvm.org/D20315
Modified:
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll
llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll
llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll
llvm/trunk/test/Transforms/LoopVectorize/induction.ll
llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Jun 1 12:16:46 2016
@@ -422,6 +422,14 @@ protected:
/// from SCEV or creates a new using SCEVExpander.
virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step);
+ /// Create a vector induction variable based on an existing scalar one.
+ /// Currently only works for integer primary induction variables with
+ /// a constant step.
+ /// If TruncType is provided, instead of widening the original IV, we
+ /// widen a version of the IV truncated to TruncType.
+ void widenInductionVariable(const InductionDescriptor &II, VectorParts &Entry,
+ IntegerType *TruncType = nullptr);
+
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values
@@ -2099,6 +2107,40 @@ Value *InnerLoopVectorizer::getStepVecto
return getStepVector(Val, StartIdx, StepValue);
}
+/// Build a dedicated vector induction PHI for the scalar induction described
+/// by \p II and fill \p Entry with one widened value per unroll part.
+///
+/// The PHI's initial value is produced in the vector loop preheader by
+/// splatting the start value and applying getStepVector() with the constant
+/// step. Inside the vector body each unroll part is the previous part advanced
+/// by VF * Step, and the value computed past the last part is what the PHI
+/// receives from the vector body.
+///
+/// \param II        Induction descriptor; its step must be a ConstantInt
+///                  (asserted below).
+/// \param Entry     Receives UF widened values, indexed by unroll part.
+/// \param TruncType If non-null, the start value and the step are narrowed to
+///                  this type first, so the vector IV is created directly in
+///                  the truncated type.
+void InnerLoopVectorizer::widenInductionVariable(const InductionDescriptor &II,
+                                                 VectorParts &Entry,
+                                                 IntegerType *TruncType) {
+  Value *Start = II.getStartValue();
+  ConstantInt *Step = II.getConstIntStepValue();
+  assert(Step && "Can not widen an IV with a non-constant step");
+
+  // Construct the initial value of the vector IV in the vector loop preheader.
+  // The builder's insertion point is saved and restored so the caller keeps
+  // emitting where it left off.
+  auto CurrIP = Builder.saveIP();
+  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+  if (TruncType) {
+    Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+  }
+  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+  Builder.restoreIP(CurrIP);
+
+  // Each unroll part covers VF consecutive scalar iterations, so every lane
+  // must advance by VF * Step. Splatting a bare VF would only be correct for
+  // a unit step. getSigned keeps negative steps intact in the IV's type.
+  Value *SplatPartStep = ConstantVector::getSplat(
+      VF, ConstantInt::getSigned(Start->getType(), VF * Step->getSExtValue()));
+  // We may need to add the step a number of times, depending on the unroll
+  // factor. The last of those goes into the PHI.
+  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+                                    &*LoopVectorBody->getFirstInsertionPt());
+  Value *LastInduction = VecInd;
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Entry[Part] = LastInduction;
+    LastInduction =
+        Builder.CreateAdd(LastInduction, SplatPartStep, "step.add");
+  }
+
+  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+  VecInd->addIncoming(LastInduction, LoopVectorBody);
+}
+
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
@@ -4056,19 +4098,25 @@ void InnerLoopVectorizer::widenPHIInstru
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction: {
assert(P->getType() == II.getStartValue()->getType() && "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- Value *V = Induction;
- if (P != OldInduction) {
- V = Builder.CreateSExtOrTrunc(Induction, P->getType());
- V = II.transform(Builder, V, PSE.getSE(), DL);
- V->setName("offset.idx");
- }
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding 0, 1, 2, etc.
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
+ if (P != OldInduction || VF == 1) {
+ Value *V = Induction;
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ if (P != OldInduction) {
+ V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+ V = II.transform(Builder, V, PSE.getSE(), DL);
+ V->setName("offset.idx");
+ }
+ Value *Broadcasted = getBroadcastInstrs(V);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding 0, 1, 2, etc.
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
+ } else {
+ // Instead of re-creating the vector IV by splatting the scalar IV
+ // in each iteration, we can make a new independent vector IV.
+ widenInductionVariable(II, Entry);
+ }
return;
}
case InductionDescriptor::IK_PtrInduction:
@@ -4239,15 +4287,23 @@ void InnerLoopVectorizer::vectorizeBlock
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
InductionDescriptor II =
- Legal->getInductionVars()->lookup(OldInduction);
+ Legal->getInductionVars()->lookup(OldInduction);
if (auto StepValue = II.getConstIntStepValue()) {
- StepValue = ConstantInt::getSigned(cast<IntegerType>(CI->getType()),
- StepValue->getSExtValue());
- Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
- CI->getType());
- Value *Broadcasted = getBroadcastInstrs(ScalarCast);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+ IntegerType *TruncType = cast<IntegerType>(CI->getType());
+ if (VF == 1) {
+ StepValue =
+ ConstantInt::getSigned(TruncType, StepValue->getSExtValue());
+ Value *ScalarCast =
+ Builder.CreateCast(CI->getOpcode(), Induction, CI->getType());
+ Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+ } else {
+ // Truncating a vector induction variable on each iteration
+ // may be expensive. Instead, truncate the initial value, and create
+ // a new, truncated, vector IV based on that.
+ widenInductionVariable(II, Entry, TruncType);
+ }
addMetadata(Entry, &*it);
break;
}
Modified: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll Wed Jun 1 12:16:46 2016
@@ -43,7 +43,7 @@ for.end12:
; CHECK-LABEL: @s173
; CHECK: load <4 x float>, <4 x float>*
-; CHECK: add i64 %index, 16000
+; CHECK: add nsw i64 %.lhs, 16000
; CHECK: ret i32 0
}
Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll Wed Jun 1 12:16:46 2016
@@ -95,7 +95,7 @@ for.end:
%struct.In = type { float, float }
;AVX512-LABEL: @foo2
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
@@ -170,10 +170,10 @@ for.end:
;}
;AVX512-LABEL: @foo3
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
-;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void
Modified: llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll Wed Jun 1 12:16:46 2016
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.8
@a = common global [2048 x i32] zeroinitializer, align 16
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
Modified: llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll Wed Jun 1 12:16:46 2016
@@ -368,7 +368,7 @@ define void @example11() nounwind uwtabl
}
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
Modified: llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll Wed Jun 1 12:16:46 2016
@@ -12,10 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:1
; CHECK-LABEL: @foo
; CHECK: vector.body
-; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index
-; CHECK: %1 = bitcast double** %0 to <4 x i64>*
-; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8
-; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK: %0 = phi
+; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0
+; CHECK: %3 = bitcast double** %2 to <4 x i64>*
+; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
+; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
; CHECK: br i1
define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
@@ -37,4 +38,4 @@ for.body:
for.end:
ret void
-}
\ No newline at end of file
+}
Modified: llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll Wed Jun 1 12:16:46 2016
@@ -12,7 +12,7 @@ target datalayout = "e-p:32:32:32-i1:8:8
@PA = external global i32*
-;; === First, the tests that should always vectorize, wither statically or by adding run-time checks ===
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
; /// Different objects, positive induction, constant distance
@@ -387,7 +387,7 @@ for.end:
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias08(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias08(i32 %a) #0 {
@@ -439,7 +439,7 @@ for.end:
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias09(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias09(i32 %a) #0 {
@@ -721,7 +721,7 @@ for.end:
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias14(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias14(i32 %a) #0 {
Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Wed Jun 1 12:16:46 2016
@@ -1,4 +1,6 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -27,8 +29,6 @@ for.end:
ret void
}
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
-
; Make sure we remove unneeded vectorization of induction variables.
; In order for instcombine to cleanup the vectorized induction variables that we
; create in the loop vectorizer we need to perform some form of redundancy
@@ -241,3 +241,64 @@ entry:
exit:
ret void
}
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ]
+; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %index.next = add i32 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+; Scalar input loop for the vector-IV checks above: an i32 induction variable
+; starts at 0 and is incremented by 1; every iteration stores the IV value
+; itself to a[IV]. The loop exits once the incremented IV equals %k.
+; The %start argument is not used by the body.
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %index.next = add i64 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+; Scalar input loop for the truncated-IV checks above: the induction variable
+; is i64 (start 0, increment 1), but the body only consumes it through a trunc
+; to i32, which is used both as the array index and as the stored value.
+; The exit compare is still done on the i64 IV against %k.
+; The %start argument is not used by the body.
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %trunc.iv = trunc i64 %indvars.iv to i32
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+  store i32 %trunc.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
Modified: llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll?rev=271410&r1=271409&r2=271410&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll Wed Jun 1 12:16:46 2016
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -6,8 +6,11 @@ target triple = "x86_64-apple-macosx10.8
@array = common global [1024 x i32] zeroinitializer, align 16
;CHECK-LABEL: @array_at_plus_one(
-;CHECK: add i64 %index, 12
-;CHECK: trunc i64
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ]
+;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12>
;CHECK: ret i32
define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
More information about the llvm-commits mailing list