[llvm] r282087 - [LV] Don't emit unused scalars for uniform instructions
Matthew Simpson via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 21 09:50:24 PDT 2016
Author: mssimpso
Date: Wed Sep 21 11:50:24 2016
New Revision: 282087
URL: http://llvm.org/viewvc/llvm-project?rev=282087&view=rev
Log:
[LV] Don't emit unused scalars for uniform instructions
If we identify an instruction as uniform after vectorization, we know that we
should only use the value corresponding to the first vector lane of each unroll
iteration. However, when scalarizing such instructions, we still produce values
for the other vector lanes. This patch prevents us from generating the unused
scalars.
Differential Revision: https://reviews.llvm.org/D24275
Modified:
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/trunk/test/Transforms/LoopVectorize/induction.ll
llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=282087&r1=282086&r2=282087&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Sep 21 11:50:24 2016
@@ -2281,11 +2281,28 @@ void InnerLoopVectorizer::buildScalarSte
assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
"Val and Step should have the same integer type");
+ auto scalarUserIsUniform = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return !OrigLoop->contains(I) || !Legal->isScalarAfterVectorization(I) ||
+ Legal->isUniformAfterVectorization(I);
+ };
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform or all its scalar users are uniform, we
+ // only need to generate the first lane. Otherwise, we generate all VF
+ // values. We are essentially determining if the induction variable has no
+ // "multi-scalar" (non-uniform scalar) users.
+ unsigned Lanes =
+ Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ||
+ all_of(EntryVal->users(), scalarUserIsUniform)
+ ? 1
+ : VF;
+
// Compute the scalar steps and save the results in VectorLoopValueMap.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
- for (unsigned Lane = 0; Lane < VF; ++Lane) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
auto *Mul = Builder.CreateMul(StartIdx, Step);
auto *Add = Builder.CreateAdd(ScalarIV, Mul);
@@ -2332,6 +2349,9 @@ InnerLoopVectorizer::getVectorValue(Valu
// Initialize a new vector map entry.
VectorParts Entry(UF);
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
if (VF == 1) {
@@ -2340,9 +2360,12 @@ InnerLoopVectorizer::getVectorValue(Valu
return VectorLoopValueMap.initVector(V, Entry);
}
- // Get the last scalarized instruction. This corresponds to the instruction
- // we created for the last vector lane on the last unroll iteration.
- auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, VF - 1));
+ // Get the last scalar instruction we generated for V. If the value is
+ // known to be uniform after vectorization, this corresponds to lane zero
+ // of the last unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the last unroll iteration.
+ unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+ auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@@ -2350,15 +2373,24 @@ InnerLoopVectorizer::getVectorValue(Valu
auto NewIP = std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
- // However, if we are vectorizing, we need to construct the vector values
- // using insertelement instructions. Since the resulting vectors are stored
- // in VectorLoopValueMap, we will only generate the insertelements once.
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF));
- for (unsigned Lane = 0; Lane < VF; ++Lane)
- Insert = Builder.CreateInsertElement(
- Insert, getScalarValue(V, Part, Lane), Builder.getInt32(Lane));
- Entry[Part] = Insert;
+ Value *VectorValue = nullptr;
+ if (Legal->isUniformAfterVectorization(I)) {
+ VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
+ } else {
+ VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, getScalarValue(V, Part, Lane),
+ Builder.getInt32(Lane));
+ }
+ Entry[Part] = VectorValue;
}
Builder.restoreIP(OldIP);
return VectorLoopValueMap.initVector(V, Entry);
@@ -2378,6 +2410,9 @@ Value *InnerLoopVectorizer::getScalarVal
if (OrigLoop->isLoopInvariant(V))
return V;
+ assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
+ : true && "Uniform values only have lane zero");
+
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
@@ -2884,11 +2919,16 @@ void InnerLoopVectorizer::scalarizeInstr
if (IfPredicateInstr)
Cond = createBlockInMask(Instr->getParent());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
// For each scalar that we create:
- for (unsigned Lane = 0; Lane < VF; ++Lane) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
// Start if-block.
Value *Cmp = nullptr;
@@ -4398,12 +4438,16 @@ void InnerLoopVectorizer::widenPHIInstru
// This is the normalized GEP that starts counting at zero.
Value *PtrInd = Induction;
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
- for (unsigned Lane = 0; Lane < VF; ++Lane) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=282087&r1=282086&r2=282087&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Wed Sep 21 11:50:24 2016
@@ -78,21 +78,15 @@ loopexit:
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %[[i0:.+]] = add i64 %index, 0
-; CHECK: %[[i1:.+]] = add i64 %index, 1
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
-; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
;
; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0
-; UNROLL-NO-IC: %[[i1:.+]] = add i64 %index, 1
; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2
-; UNROLL-NO-IC: %[[i3:.+]] = add i64 %index, 3
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
-; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]]
-; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i3]]
;
; IND-LABEL: @scalarize_induction_variable_01(
; IND: vector.body:
@@ -611,9 +605,7 @@ exit:
; CHECK: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: %offset.idx = add i32 %i, %index
; CHECK: %[[A1:.*]] = add i32 %offset.idx, 0
-; CHECK: %[[A2:.*]] = add i32 %offset.idx, 1
; CHECK: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
-; CHECK: %[[G2:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A2]]
; CHECK: %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
; CHECK: %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
; CHECK: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
Modified: llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll?rev=282087&r1=282086&r2=282087&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll Wed Sep 21 11:50:24 2016
@@ -8,13 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 %startval, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
entry:
@@ -40,13 +34,7 @@ loopend:
; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i128 %startval, %index
; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
entry:
@@ -72,13 +60,7 @@ loopend:
; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i16 %startval, {{.*}}
; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
entry:
@@ -121,13 +103,7 @@ loopend:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define void @reverse_forward_induction_i64_i8() {
entry:
@@ -153,13 +129,7 @@ while.end:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define void @reverse_forward_induction_i64_i8_signed() {
entry:
More information about the llvm-commits mailing list