[llvm] b7a1127 - [LoopVectorize] Fix scalarisation crash in widenPHIInstruction for scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed May 12 03:02:17 PDT 2021


Author: David Sherwood
Date: 2021-05-12T11:02:11+01:00
New Revision: b7a11274f90f07537e2151fa4424db257ff9a950

URL: https://github.com/llvm/llvm-project/commit/b7a11274f90f07537e2151fa4424db257ff9a950
DIFF: https://github.com/llvm/llvm-project/commit/b7a11274f90f07537e2151fa4424db257ff9a950.diff

LOG: [LoopVectorize] Fix scalarisation crash in widenPHIInstruction for scalable vectors

In InnerLoopVectorizer::widenPHIInstruction there are cases where we have
to scalarise a pointer induction variable after vectorisation. For scalable
vectors we already deal with the case where the pointer induction variable
is uniform, but we currently crash if it is not. For fixed-width vectors
we calculate every lane of the scalarised pointer induction variable for a
given VF; this cannot work for scalable vectors, where the number of lanes
is unknown at compile time. In that case I have added support for caching
the whole vector value for each unrolled part so that we can always
extract an arbitrary element. Additionally, we continue to cache the known
minimum number of lanes in order to improve code quality by avoiding an
extractelement operation.
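
For illustration, the per-part vector of lane indices is assembled from a
stepvector and two splats. A minimal standalone IRBuilder sketch of that
pattern follows (this is a hand-written demo, not code from the patch; the
function name lane_indices and its arguments are invented for the example):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("demo", Ctx);
    Type *I64 = Type::getInt64Ty(Ctx);
    // <vscale x 2 x i64>: a scalable vector with a known minimum of 2 lanes.
    auto *VecTy = VectorType::get(I64, ElementCount::getScalable(2));
    auto *FTy = FunctionType::get(VecTy, {I64, I64}, /*isVarArg=*/false);
    Function *F =
        Function::Create(FTy, Function::ExternalLinkage, "lane_indices", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

    Value *PtrInd = F->getArg(0);    // scalar induction value this iteration
    Value *PartStart = F->getArg(1); // first lane of this unrolled part

    // <0, 1, ..., N-1>, where N is the runtime element count.
    Value *UnitStepVec = B.CreateStepVector(VecTy);
    // Lane indices for this part: PartStart + <0, 1, ...>.
    Value *Indices = B.CreateAdd(
        B.CreateVectorSplat(ElementCount::getScalable(2), PartStart),
        UnitStepVec);
    // One global index per lane; any lane can later be extracted.
    Value *GlobalIndices = B.CreateAdd(
        B.CreateVectorSplat(ElementCount::getScalable(2), PtrInd), Indices);
    B.CreateRet(GlobalIndices);

    M.print(outs(), nullptr);
    return 0;
  }

Printing the module shows the stepvector/splat/add sequence that the new
CHECK lines in sve-widen-phi.ll below match against.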

I have adapted an existing test `pointer_iv_mixed` from the file:

  Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

and added a version of it for scalable vectors here:

  Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Differential Revision: https://reviews.llvm.org/D101294

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d5dbc2e18a514..7e8114f379642 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3335,8 +3335,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
   SCEVExpander Exp(*SE, DL, "induction");
   auto Step = ID.getStep();
   auto StartValue = ID.getStartValue();
-  assert(Index->getType() == Step->getType() &&
-         "Index type does not match StepValue type");
+  assert(Index->getType()->getScalarType() == Step->getType() &&
+         "Index scalar type does not match StepValue type");
 
   // Note: the IR at this point is broken. We cannot use SE to create any new
   // SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -3355,14 +3355,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     return B.CreateAdd(X, Y);
   };
 
+  // We allow X to be a vector type, in which case Y will potentially be
+  // splatted into a vector with the same element count.
   auto CreateMul = [&B](Value *X, Value *Y) {
-    assert(X->getType() == Y->getType() && "Types don't match!");
+    assert(X->getType()->getScalarType() == Y->getType() &&
+           "Types don't match!");
     if (auto *CX = dyn_cast<ConstantInt>(X))
       if (CX->isOne())
         return Y;
     if (auto *CY = dyn_cast<ConstantInt>(Y))
       if (CY->isOne())
         return X;
+    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+    if (XVTy && !isa<VectorType>(Y->getType()))
+      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
     return B.CreateMul(X, Y);
   };
 
@@ -3381,6 +3387,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
 
   switch (ID.getKind()) {
   case InductionDescriptor::IK_IntInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for integer inductions yet");
     assert(Index->getType() == StartValue->getType() &&
            "Index type does not match StartValue type");
     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@@ -3395,9 +3403,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     return B.CreateGEP(
         StartValue->getType()->getPointerElementType(), StartValue,
         CreateMul(Index,
-                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
+                                    GetInsertPoint())));
   }
   case InductionDescriptor::IK_FpInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for FP inductions yet");
     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
     auto InductionBinOp = ID.getInductionBinOp();
     assert(InductionBinOp &&
@@ -4816,13 +4827,33 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
       // iteration. If the instruction is uniform, we only need to generate the
       // first lane. Otherwise, we generate all VF values.
       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
-      assert((IsUniform || !VF.isScalable()) &&
-             "Currently unsupported for scalable vectors");
-      unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
+      unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+
+      bool NeedsVectorIndex = !IsUniform && VF.isScalable();
+      Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
+      if (NeedsVectorIndex) {
+        Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
+        UnitStepVec = Builder.CreateStepVector(VecIVTy);
+        PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
+      }
 
       for (unsigned Part = 0; Part < UF; ++Part) {
         Value *PartStart = createStepForVF(
             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+
+        if (NeedsVectorIndex) {
+          Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
+          Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
+          Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
+          Value *SclrGep =
+              emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
+          SclrGep->setName("next.gep");
+          State.set(PhiR, SclrGep, Part);
+          // We've cached the whole vector, which means we can support the
+          // extraction of any lane.
+          continue;
+        }
+
         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
           Value *Idx = Builder.CreateAdd(
               PartStart, ConstantInt::get(PtrInd->getType(), Lane));

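The key property the IK_PtrInduction changes above rely on is that a GEP
with a scalar pointer base and a vector index produces a vector of
pointers, one per lane; that is where the <vscale x 2 x i32*> values in
the test below come from. A minimal standalone sketch of just that step
(again a hand-written demo with an invented function name vec_gep, not
code from the patch):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("demo", Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);
    Type *I32Ptr = PointerType::getUnqual(I32);
    auto EC = ElementCount::getScalable(2);
    auto *IdxVecTy = VectorType::get(Type::getInt64Ty(Ctx), EC);

    // Takes an i32* base and returns <vscale x 2 x i32*>.
    auto *FTy = FunctionType::get(VectorType::get(I32Ptr, EC), {I32Ptr},
                                  /*isVarArg=*/false);
    Function *F =
        Function::Create(FTy, Function::ExternalLinkage, "vec_gep", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

    // A scalar base combined with a vector of indices yields a vector GEP,
    // i.e. one pointer per lane.
    Value *Indices = B.CreateStepVector(IdxVecTy);
    Value *VecOfPtrs = B.CreateGEP(I32, F->getArg(0), Indices);
    B.CreateRet(VecOfPtrs);

    M.print(outs(), nullptr);
    return 0;
  }
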
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 21a8dba2053d0..353ad64122fd0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -114,9 +114,68 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 }
 
+
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+
+define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) {
+; CHECK-LABEL: @pointer_iv_mixed(
+; CHECK:     vector.body
+; CHECK:       %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK:       %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:  %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0
+; CHECK-NEXT:  %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:  %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
+; CHECK-NEXT:  %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:  %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
+; CHECK-NEXT:  %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
+; CHECK-NEXT:  %[[TMP4:.*]] = shufflevector <vscale x 2 x i64> %[[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:  %[[TMP5:.*]] = add <vscale x 2 x i64> %[[TMP4]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
+; CHECK-NEXT:  %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
+; CHECK-NEXT:  %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
+; CHECK:       %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
+; CHECK-NEXT:  store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
+; CHECK:       %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:  %[[VSCALE32X2:.*]] = shl i32 %[[VSCALE32]], 1
+; CHECK-NEXT:  %[[TMP6:.*]] = sext i32 %[[VSCALE32X2]] to i64
+; CHECK-NEXT:  %[[GEPB2:.*]] = getelementptr i32*, i32** %[[GEPB1]], i64 %[[TMP6]]
+; CHECK-NEXT:  %[[BPTR2:.*]] = bitcast i32** %[[GEPB2]] to <vscale x 2 x i32*>*
+; CHECK-NEXT:  store <vscale x 2 x i32*> %[[APTRS2]], <vscale x 2 x i32*>* %[[BPTR2]], align 8
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp1 = load i32, i32* %p, align 8
+  %tmp2 = add i32 %tmp1, %tmp0
+  store i32* %p, i32** %q, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
+
+for.end:
+  %tmp5 = phi i32 [ %tmp2, %for.body ]
+  ret i32 %tmp5
+}
+
+
 !0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
 !5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}

More information about the llvm-commits mailing list