[PATCH] Extend SLPVectorizer for cases where insertelement instructions must be rescheduled
Arch D. Robison
arch.robison at intel.com
Tue Mar 25 13:01:59 PDT 2014
I separated the typo corrections and cost model change into separate patches [D3154 and D3160]. The remaining patch here addresses only the rescheduling issue. Rotem's comments motivated me to simplify the modifications to buildVector. I've updated the summary to explain the difference between a typical reduction and a "build vector" reduction.
Hi #llvm,
http://llvm-reviews.chandlerc.com/D3143
CHANGE SINCE LAST DIFF
http://llvm-reviews.chandlerc.com/D3143?vs=8021&id=8105#toc
Files:
lib/Transforms/Vectorize/SLPVectorizer.cpp
test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -366,8 +366,9 @@
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots and is possibly
- /// used by a reduction of \p RdxOps.
- void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
+ /// used by a reduction of \p RdxOps. Flag \p RdxFreeExtract should be true
+ /// if the reduction values do not need to be extracted.
+ void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0, bool RdxFreeExtract=false);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -384,6 +385,10 @@
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
+
+ /// \brief Move InsertElement instructions that precede the last instruction of
+ /// \p VL to after it. \p IE is the root of a chain identified by findBuildVector.
+ void movePrematureInserts(ArrayRef<Value *> VL, InsertElementInst *IE);
private:
struct TreeEntry;
@@ -542,7 +547,8 @@
IRBuilder<> Builder;
};
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx, bool RdxFreeExtract) {
+ assert(!RdxFreeExtract||Rdx);
deleteTree();
RdxOps = Rdx;
if (!getSameType(Roots))
@@ -576,8 +582,8 @@
if (!UserInst)
continue;
- // Ignore uses that are part of the reduction.
- if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
+ // Ignore uses that are part of the reduction that will not need extracts.
+ if (RdxFreeExtract && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
@@ -1840,6 +1846,29 @@
GatherSeq.clear();
}
+void BoUpSLP::movePrematureInserts(ArrayRef<Value *> VL, InsertElementInst *IE) {
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ int MyLastIndex = getLastIndex(VL);
+ BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+ BlockNumbering &BN = BlocksNumbers[BB];
+ DEBUG(dbgs() << "SLP: Moving premature inserts\n");
+ Instruction* x = BN.getInstruction(MyLastIndex);
+ while (IE->getParent()==BB) {
+ int UserIndex = BN.getIndex(IE);
+ if (UserIndex >= MyLastIndex) {
+ // Walked past transformed region
+ break;
+ }
+ IE->removeFromParent();
+ IE->insertAfter(x);
+ DEBUG(dbgs() << "SLP: Rescheduled: " << *IE << ".\n");
+ x = IE;
+ IE = dyn_cast<InsertElementInst>(IE->user_back());
+ if (!IE)
+ break;
+ }
+}
+
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
typedef SmallVector<StoreInst *, 8> StoreList;
@@ -1943,7 +1972,7 @@
/// \brief Try to vectorize a list of operands.
/// \returns true if a value was vectorized.
- bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, InsertElementInst *IE=0, BoUpSLP::ValueSet *Inserts=0);
/// \brief Try to vectorize a chain that may start at the operands of \V;
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -2116,7 +2145,7 @@
return tryToVectorizeList(VL, R);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, InsertElementInst *IE, BoUpSLP::ValueSet* Inserts) {
if (VL.size() < 2)
return false;
@@ -2166,10 +2195,14 @@
<< "\n");
ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
- R.buildTree(Ops);
+ R.buildTree(Ops, Inserts);
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
+ if (Inserts) {
+ R.movePrematureInserts(VL, IE);
+ Inserts = 0;
+ }
DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
R.vectorizeTree();
@@ -2412,7 +2445,7 @@
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
- V.buildTree(ValsToReduce, &ReductionOps);
+ V.buildTree(ValsToReduce, &ReductionOps, true);
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -2529,10 +2562,12 @@
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
-/// Returns true if it matches
+/// Returns true if it matches. Sets \p Ops to the values inserted
+/// and \p Inserts to the insertelement instructions.
///
static bool findBuildVector(InsertElementInst *IE,
- SmallVectorImpl<Value *> &Ops) {
+ SmallVectorImpl<Value *> &Ops,
+ BoUpSLP::ValueSet &Inserts) {
if (!isa<UndefValue>(IE->getOperand(0)))
return false;
@@ -2551,6 +2586,7 @@
if (!IE->hasOneUse())
return false;
+ Inserts.insert(IE);
IE = NextUse;
}
@@ -2709,10 +2745,11 @@
// Try to vectorize trees that start at insertelement instructions.
if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
SmallVector<Value *, 8> Ops;
- if (!findBuildVector(IE, Ops))
+ BoUpSLP::ValueSet Inserts;
+ if (!findBuildVector(IE, Ops, Inserts))
continue;
- if (tryToVectorizeList(Ops, R)) {
+ if (tryToVectorizeList(Ops, R, IE, &Inserts)) {
Changed = true;
it = BB->begin();
e = BB->end();
Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -194,4 +194,29 @@
ret <4 x float> %rb
}
+; Make sure that vectorization happens even if insertelement operations
+; must be rescheduled. The case here is from compiling Julia.
+define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @reschedule_extract(
+; CHECK: %1 = fadd <4 x float> %a, %b
+ %a0 = extractelement <4 x float> %a, i32 0
+ %b0 = extractelement <4 x float> %b, i32 0
+ %c0 = fadd float %a0, %b0
+ %v0 = insertelement <4 x float> undef, float %c0, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %b1 = extractelement <4 x float> %b, i32 1
+ %c1 = fadd float %a1, %b1
+ %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %b2 = extractelement <4 x float> %b, i32 2
+ %c2 = fadd float %a2, %b2
+ %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c3 = fadd float %a3, %b3
+ %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+ ret <4 x float> %v3
+}
+
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D3143.3.patch
Type: text/x-patch
Size: 7305 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140325/4e756168/attachment.bin>
More information about the llvm-commits
mailing list