[llvm] r285097 - [LV] Sink scalar operands of predicated instructions

Matthew Simpson via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 25 11:59:46 PDT 2016


Author: mssimpso
Date: Tue Oct 25 13:59:45 2016
New Revision: 285097

URL: http://llvm.org/viewvc/llvm-project?rev=285097&view=rev
Log:
[LV] Sink scalar operands of predicated instructions

When we predicate an instruction (div, rem, store) we place the instruction in
its own basic block within the vectorized loop. If a predicated instruction has
scalar operands, it's possible to recursively sink these scalar expressions
into the predicated block so that they might avoid execution. This patch sinks
as much scalar computation as possible into predicated blocks. We previously
were able to sink such operands only if they were extractelement instructions.

Differential Revision: https://reviews.llvm.org/D25632

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
    llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
    llvm/trunk/test/Transforms/LoopVectorize/induction.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=285097&r1=285096&r2=285097&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Oct 25 13:59:45 2016
@@ -437,6 +437,10 @@ protected:
   /// See PR14725.
   void fixLCSSAPHIs();
 
+  /// Iteratively sink the scalarized operands of a predicated instruction into
+  /// the block that was created for it.
+  void sinkScalarOperands(Instruction *PredInst);
+
   /// Predicate conditional instructions that require predication on their
   /// respective conditions.
   void predicateInstructions();
@@ -4249,15 +4253,82 @@ void InnerLoopVectorizer::collectTrivial
   }
 }
 
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+
+  // The basic block and loop containing the predicated instruction.
+  auto *PredBB = PredInst->getParent();
+  auto *VectorLoop = LI->getLoopFor(PredBB);
+
+  // Initialize a worklist with the operands of the predicated instruction.
+  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+  // Holds instructions that we need to analyze again. An instruction may be
+  // reanalyzed if we don't yet know if we can sink it or not.
+  SmallVector<Instruction *, 8> InstsToReanalyze;
+
+  // Returns true if a given use occurs in the predicated block. Phi nodes use
+  // their operands in their corresponding predecessor blocks.
+  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+    auto *I = cast<Instruction>(U.getUser());
+    BasicBlock *BB = I->getParent();
+    if (auto *Phi = dyn_cast<PHINode>(I))
+      BB = Phi->getIncomingBlock(
+          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+    return BB == PredBB;
+  };
+
+  // Iteratively sink the scalarized operands of the predicated instruction
+  // into the block we created for it. When an instruction is sunk, its
+  // operands are then added to the worklist. The algorithm ends after one pass
+  // through the worklist doesn't sink a single instruction.
+  bool Changed;
+  do {
+
+    // Add the instructions that need to be reanalyzed to the worklist, and
+    // reset the changed indicator.
+    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+    InstsToReanalyze.clear();
+    Changed = false;
+
+    while (!Worklist.empty()) {
+      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+      // We can't sink an instruction if it is a phi node, is already in the
+      // predicated block, is not in the loop, or may have side effects.
+      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+          !VectorLoop->contains(I) || I->mayHaveSideEffects())
+        continue;
+
+      // It's legal to sink the instruction if all its uses occur in the
+      // predicated block. Otherwise, there's nothing to do yet, and we may
+      // need to reanalyze the instruction.
+      if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+        InstsToReanalyze.push_back(I);
+        continue;
+      }
+
+      // Move the instruction to the beginning of the predicated block, and add
+      // its operands to the worklist.
+      I->moveBefore(&*PredBB->getFirstInsertionPt());
+      Worklist.insert(I->op_begin(), I->op_end());
+
+      // The sinking may have enabled other instructions to be sunk, so we will
+      // need to iterate.
+      Changed = true;
+    }
+  } while (Changed);
+}
+
 void InnerLoopVectorizer::predicateInstructions() {
 
   // For each instruction I marked for predication on value C, split I into its
-  // own basic block to form an if-then construct over C.
-  // Since I may be fed by extractelement and/or be feeding an insertelement
-  // generated during scalarization we try to move such instructions into the
-  // predicated basic block as well. For the insertelement this also means that
-  // the PHI will be created for the resulting vector rather than for the
-  // scalar instruction.
+  // own basic block to form an if-then construct over C. Since I may be fed by
+  // an extractelement instruction or other scalar operand, we try to
+  // iteratively sink its scalar operands into the predicated block. If I feeds
+  // an insertelement instruction, we try to move this instruction into the
+  // predicated block as well. For non-void types, a phi node will be created
+  // for the resulting value (either vector or scalar).
+  //
   // So for some predicated instruction, e.g. the conditional sdiv in:
   //
   // for.body:
@@ -4331,13 +4402,7 @@ void InnerLoopVectorizer::predicateInstr
     auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
                                         /*BranchWeights=*/nullptr, DT, LI);
     I->moveBefore(T);
-    // Try to move any extractelement we may have created for the predicated
-    // instruction into the Then block.
-    for (Use &Op : I->operands()) {
-      auto *OpInst = dyn_cast<ExtractElementInst>(&*Op);
-      if (OpInst && OpInst->hasOneUse()) // TODO: more accurately - hasOneUser()
-        OpInst->moveBefore(&*I);
-    }
+    sinkScalarOperands(&*I);
 
     I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
     BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");

Modified: llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll?rev=285097&r1=285096&r2=285097&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll Tue Oct 25 13:59:45 2016
@@ -200,15 +200,15 @@ for.end:
 ; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
 ; INTER:     vector.body
 ; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
-; INTER:       %[[I1:.+]] = or i64 %index, 1
-; INTER:       %[[I2:.+]] = or i64 %index, 2
-; INTER:       %[[I3:.+]] = or i64 %index, 3
 ; INTER:       %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER:       %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
+; INTER:       %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
+; INTER:       %[[I1:.+]] = or i64 %index, 1
 ; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; INTER:       %[[I2:.+]] = or i64 %index, 2
 ; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; INTER:       %[[I3:.+]] = or i64 %index, 3
 ; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
-; INTER:       %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
-; INTER:       %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
 ; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
 ;
 define void @predicated_store(%pair *%p, i32 %x, i64 %n) {

Modified: llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll?rev=285097&r1=285096&r2=285097&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll Tue Oct 25 13:59:45 2016
@@ -11,9 +11,6 @@ entry:
 
 ; VEC-LABEL: test
 ; VEC:   %[[v0:.+]] = add i64 %index, 0
-; VEC:   %[[v1:.+]] = add i64 %index, 1
-; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC:   %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
 ; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
 ; VEC:   %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
 ; VEC:   %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
@@ -24,6 +21,7 @@ entry:
 ;
 ; VEC: [[cond]]:
 ; VEC:   %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
 ; VEC:   store i32 %[[v13]], i32* %[[v2]], align 4
 ; VEC:   br label %[[else:.+]]
 ;
@@ -34,6 +32,8 @@ entry:
 ;
 ; VEC: [[cond2]]:
 ; VEC:   %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC:   %[[v1:.+]] = add i64 %index, 1
+; VEC:   %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
 ; VEC:   store i32 %[[v17]], i32* %[[v4]], align 4
 ; VEC:   br label %[[else2:.+]]
 ;
@@ -49,14 +49,13 @@ entry:
 ; UNROLL:   %[[v3:[a-zA-Z0-9]+]] = load i32, i32* %[[v1]], align 4
 ; UNROLL:   %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
 ; UNROLL:   %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
-; UNROLL:   %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
-; UNROLL:   %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
 ; UNROLL:   %[[o1:[a-zA-Z0-9]+]] = or i1 false, %[[v4]]
 ; UNROLL:   %[[o2:[a-zA-Z0-9]+]] = or i1 false, %[[v5]]
 ; UNROLL:   %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[o1]], true
 ; UNROLL:   br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
 ;
 ; UNROLL: [[cond]]:
+; UNROLL:   %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
 ; UNROLL:   store i32 %[[v6]], i32* %[[v0]], align 4
 ; UNROLL:   br label %[[else]]
 ;
@@ -65,6 +64,7 @@ entry:
 ; UNROLL:   br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
 ;
 ; UNROLL: [[cond2]]:
+; UNROLL:   %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
 ; UNROLL:   store i32 %[[v7]], i32* %[[v1]], align 4
 ; UNROLL:   br label %[[else2]]
 ;

Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=285097&r1=285096&r2=285097&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Tue Oct 25 13:59:45 2016
@@ -303,58 +303,58 @@ for.end:
 ; CHECK: vector.body:
 ; CHECK:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ]
 ; CHECK:   %[[I0:.+]] = add i32 %index, 0
-; CHECK:   %[[I1:.+]] = add i32 %index, 1
 ; CHECK:   getelementptr inbounds i32, i32* %a, i32 %[[I0]]
 ; CHECK: pred.udiv.if:
 ; CHECK:   udiv i32 {{.*}}, %[[I0]]
 ; CHECK: pred.udiv.if1:
+; CHECK:   %[[I1:.+]] = add i32 %index, 1
 ; CHECK:   udiv i32 {{.*}}, %[[I1]]
 ;
 ; UNROLL-NO_IC-LABEL: @scalarize_induction_variable_05(
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ]
 ; UNROLL-NO-IC:   %[[I0:.+]] = add i32 %index, 0
-; UNROLL-NO-IC:   %[[I1:.+]] = add i32 %index, 1
 ; UNROLL-NO-IC:   %[[I2:.+]] = add i32 %index, 2
-; UNROLL-NO-IC:   %[[I3:.+]] = add i32 %index, 3
 ; UNROLL-NO-IC:   getelementptr inbounds i32, i32* %a, i32 %[[I0]]
 ; UNROLL-NO-IC:   getelementptr inbounds i32, i32* %a, i32 %[[I2]]
 ; UNROLL-NO-IC: pred.udiv.if:
 ; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I0]]
 ; UNROLL-NO-IC: pred.udiv.if6:
+; UNROLL-NO-IC:   %[[I1:.+]] = add i32 %index, 1
 ; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I1]]
 ; UNROLL-NO-IC: pred.udiv.if8:
 ; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I2]]
 ; UNROLL-NO-IC: pred.udiv.if10:
+; UNROLL-NO-IC:   %[[I3:.+]] = add i32 %index, 3
 ; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I3]]
 ;
 ; IND-LABEL: @scalarize_induction_variable_05(
 ; IND: vector.body:
 ; IND:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ]
-; IND:   %[[I1:.+]] = or i32 %index, 1
 ; IND:   %[[E0:.+]] = sext i32 %index to i64
 ; IND:   getelementptr inbounds i32, i32* %a, i64 %[[E0]]
 ; IND: pred.udiv.if:
 ; IND:   udiv i32 {{.*}}, %index
 ; IND: pred.udiv.if1:
+; IND:   %[[I1:.+]] = or i32 %index, 1
 ; IND:   udiv i32 {{.*}}, %[[I1]]
 ;
 ; UNROLL-LABEL: @scalarize_induction_variable_05(
 ; UNROLL: vector.body:
 ; UNROLL:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ]
-; UNROLL:   %[[I1:.+]] = or i32 %index, 1
 ; UNROLL:   %[[I2:.+]] = or i32 %index, 2
-; UNROLL:   %[[I3:.+]] = or i32 %index, 3
 ; UNROLL:   %[[E0:.+]] = sext i32 %index to i64
 ; UNROLL:   %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
 ; UNROLL:   getelementptr i32, i32* %[[G0]], i64 2
 ; UNROLL: pred.udiv.if:
 ; UNROLL:   udiv i32 {{.*}}, %index
 ; UNROLL: pred.udiv.if6:
+; UNROLL:   %[[I1:.+]] = or i32 %index, 1
 ; UNROLL:   udiv i32 {{.*}}, %[[I1]]
 ; UNROLL: pred.udiv.if8:
 ; UNROLL:   udiv i32 {{.*}}, %[[I2]]
 ; UNROLL: pred.udiv.if10:
+; UNROLL:   %[[I3:.+]] = or i32 %index, 3
 ; UNROLL:   udiv i32 {{.*}}, %[[I3]]
 
 define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {




More information about the llvm-commits mailing list