[llvm] r343954 - [LV] Do not create SCEVs on broken IR in emitTransformedIndex. PR39160

Sun Oct 7 22:46:30 PDT 2018

Author: mkazantsev
Date: Sun Oct  7 22:46:29 2018
New Revision: 343954

URL: http://llvm.org/viewvc/llvm-project?rev=343954&view=rev
Log:
[LV] Do not create SCEVs on broken IR in emitTransformedIndex. PR39160

At the point when we perform `emitTransformedIndex`, we have a broken IR (in
particular, we have Phis for which not every incoming value is properly set). On
such IR, it is illegal to create SCEV expressions, because their internal
simplification process may try to prove some predicates and break when it
stumbles across some broken IR.

The only purpose of using SCEV in this particular place is attempt to simplify
the generated code slightly. It seems that the result isn't worth it, because
some trivial cases (like addition of zero and multiplication by 1) can be
handled separately if needed, but more generally InstCombine is able to achieve
the goals we want to achieve by using SCEV.

This patch fixes a functional crash described in PR39160, and as side-effect it
also generates a bit smarter code in some simple cases. It also may cause some
optimality loss (i.e. we will now generate `mul` by power of `2` instead of
shift etc), but there is nothing what InstCombine could not handle later. In
case of dire need, we can support more trivial cases just in place.

Note that this patch only fixes one particular case of the general problem that
LV misuses SCEV, attempting to create SCEVs or prove predicates on invalid IR.
The general solution, however, seems complex enough.

Differential Revision: https://reviews.llvm.org/D52881
Reviewed By: fhahn, hsaito

Added:
    llvm/trunk/test/Transforms/LoopVectorize/pr39160.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll
    llvm/trunk/test/Transforms/LoopVectorize/induction.ll
    llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=343954&r1=343953&r2=343954&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Sun Oct  7 22:46:29 2018
@@ -2507,33 +2507,52 @@ Value *InnerLoopVectorizer::emitTransfor
   auto StartValue = ID.getStartValue();
   assert(Index->getType() == Step->getType() &&
          "Index type does not match StepValue type");
+
+  // Note: the IR at this point is broken. We cannot use SE to create any new
+  // SCEV and then expand it, hoping that SCEV's simplification will give us
+  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
+  // lead to various SCEV crashes. So all we can do is to use builder and rely
+  // on InstCombine for future simplifications. Here we handle some trivial
+  // cases only.
+  auto CreateAdd = [&B](Value *X, Value *Y) {
+    assert(X->getType() == Y->getType() && "Types don't match!");
+    if (auto *CX = dyn_cast<ConstantInt>(X))
+      if (CX->isZero())
+        return Y;
+    if (auto *CY = dyn_cast<ConstantInt>(Y))
+      if (CY->isZero())
+        return X;
+    return B.CreateAdd(X, Y);
+  };
+
+  auto CreateMul = [&B](Value *X, Value *Y) {
+    assert(X->getType() == Y->getType() && "Types don't match!");
+    if (auto *CX = dyn_cast<ConstantInt>(X))
+      if (CX->isOne())
+        return Y;
+    if (auto *CY = dyn_cast<ConstantInt>(Y))
+      if (CY->isOne())
+        return X;
+    return B.CreateMul(X, Y);
+  };
+
   switch (ID.getKind()) {
   case InductionDescriptor::IK_IntInduction: {
     assert(Index->getType() == StartValue->getType() &&
            "Index type does not match StartValue type");
-
-    // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
-    // and calculate (Start + Index * Step) for all cases, without
-    // special handling for "isOne" and "isMinusOne".
-    // But in the real life the result code getting worse. We mix SCEV
-    // expressions and ADD/SUB operations and receive redundant
-    // intermediate values being calculated in different ways and
-    // Instcombine is unable to reduce them all.
-
     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
       return B.CreateSub(StartValue, Index);
-    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne())
-      return B.CreateAdd(StartValue, Index);
-    const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
-                                   SE->getMulExpr(Step, SE->getSCEV(Index)));
-    return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
+    auto *Offset = CreateMul(
+        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+    return CreateAdd(StartValue, Offset);
   }
   case InductionDescriptor::IK_PtrInduction: {
     assert(isa<SCEVConstant>(Step) &&
            "Expected constant step for pointer induction");
-    const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
-    Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
-    return B.CreateGEP(nullptr, StartValue, Index);
+    return B.CreateGEP(
+        nullptr, StartValue,
+        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
+                                           &*B.GetInsertPoint())));
   }
   case InductionDescriptor::IK_FpInduction: {
     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll?rev=343954&r1=343953&r2=343954&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll Sun Oct  7 22:46:29 2018
@@ -18,20 +18,19 @@ define void @f1() {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 0, [[TMP0]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i16*, i16** [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16** [[TMP4]] to <2 x i16*>*
-; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
+; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 
 bb1:

Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=343954&r1=343953&r2=343954&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Sun Oct  7 22:46:29 2018
@@ -138,7 +138,7 @@ for.end:
 ; CHECK-LABEL: @scalarize_induction_variable_02(
 ; CHECK: vector.body:
 ; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:   %offset.idx = shl i64 %index, 3
+; CHECK:   %offset.idx = mul i64 %index, 8
 ; CHECK:   %[[i0:.+]] = add i64 %offset.idx, 0
 ; CHECK:   %[[i1:.+]] = add i64 %offset.idx, 8
 ; CHECK:   getelementptr inbounds float, float* %a, i64 %[[i0]]
@@ -149,7 +149,7 @@ for.end:
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC:   %offset.idx = shl i64 %index, 3
+; UNROLL-NO-IC:   %offset.idx = mul i64 %index, 8
 ; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %offset.idx, 0
 ; UNROLL-NO-IC:   %[[i1:.+]] = add i64 %offset.idx, 8
 ; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %offset.idx, 16

Modified: llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll?rev=343954&r1=343953&r2=343954&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll Sun Oct  7 22:46:29 2018
@@ -23,11 +23,10 @@ for.end:
 ; CHECK-LABEL: @preinc
 ; CHECK-LABEL: middle.block:
 ; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
-; CHECK: %ind.escape = add i32 0, %[[v3]]
 ; CHECK-LABEL: scalar.ph:
 ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
 ; CHECK-LABEL: for.end:
-; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ]
 ; CHECK: ret i32 %[[RET]]
 define i32 @preinc(i32 %k)  {
 entry:
@@ -135,16 +134,13 @@ for.end:
 }
 
 ; CHECK-LABEL: @PR30742
+; CHECK:   %[[T15:.+]] = add nsw i32 %tmp03, -7
 ; CHECK: vector.ph
 ; CHECK:   %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2
 ; CHECK:   %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]]
 ; CHECK: middle.block
 ; CHECK:   %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]]
-; CHECK:   %[[T15:.+]] = add i32 %tmp03, -7
-; CHECK:   %[[T16:.+]] = shl i32 %[[N_MOD_VF]], 3
-; CHECK:   %[[T17:.+]] = add i32 %[[T15]], %[[T16]]
-; CHECK:   %[[T18:.+]] = shl i32 {{.*}}, 3
-; CHECK:   %ind.escape = sub i32 %[[T17]], %[[T18]]
+; CHECK:   %ind.escape = add i32 %[[T15]],
 ; CHECK:   br i1 %[[CMP]], label %BB3, label %scalar.ph
 define void @PR30742() {
 BB0:

Added: llvm/trunk/test/Transforms/LoopVectorize/pr39160.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/pr39160.ll?rev=343954&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/pr39160.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/pr39160.ll Sun Oct  7 22:46:29 2018
@@ -0,0 +1,98 @@
+; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure that we can compile the test without crash.
+define void @barney() {
+
+; CHECK-LABEL: @barney(
+; CHECK:       middle.block:
+
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %tmp4 = icmp slt i32 undef, 0
+  br i1 %tmp4, label %bb2, label %bb5
+
+bb5:                                              ; preds = %bb2
+  br label %bb19
+
+bb18:                                             ; preds = %bb33
+  ret void
+
+bb19:                                             ; preds = %bb36, %bb5
+  %tmp21 = phi i64 [ undef, %bb36 ], [ 2, %bb5 ]
+  %tmp22 = phi i32 [ %tmp65, %bb36 ], [ undef, %bb5 ]
+  br label %bb50
+
+bb33:                                             ; preds = %bb62
+  br i1 undef, label %bb18, label %bb36
+
+bb36:                                             ; preds = %bb33
+  br label %bb19
+
+bb46:                                             ; preds = %bb50
+  br i1 undef, label %bb48, label %bb59
+
+bb48:                                             ; preds = %bb46
+  %tmp49 = add i32 %tmp52, 14
+  ret void
+
+bb50:                                             ; preds = %bb50, %bb19
+  %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
+  %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+  %tmp54 = add i32 %tmp52, 12
+  %tmp55 = add i32 %tmp52, 13
+  %tmp56 = add nuw nsw i64 %tmp53, 1
+  %tmp58 = icmp ult i64 %tmp53, undef
+  br i1 %tmp58, label %bb50, label %bb46
+
+bb59:                                             ; preds = %bb46
+  br label %bb62
+
+bb62:                                             ; preds = %bb68, %bb59
+  %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp55, %bb59 ]
+  %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp56, %bb59 ]
+  %tmp65 = add i32 %tmp63, 13
+  %tmp66 = add nuw nsw i64 %tmp64, 1
+  %tmp67 = icmp ult i64 %tmp66, %tmp21
+  br i1 %tmp67, label %bb68, label %bb33
+
+bb68:                                             ; preds = %bb62
+  br label %bb62
+}
+
+define i32 @foo(i32 addrspace(1)* %p) {
+
+; CHECK-LABEL: foo
+; CHECK:       middle.block:
+
+entry:
+  br label %outer
+
+outer:                                            ; preds = %outer_latch, %entry
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %outer_latch ]
+  br label %inner
+
+inner:                                            ; preds = %inner, %outer
+  %0 = phi i32 [ %2, %inner ], [ 0, %outer ]
+  %a = phi i32 [ %3, %inner ], [ 1, %outer ]
+  %b = phi i32 [ %1, %inner ], [ 6, %outer ]
+  %1 = add i32 %b, 2
+  %2 = or i32 %0, %b
+  %3 = add nuw nsw i32 %a, 1
+  %4 = zext i32 %3 to i64
+  %5 = icmp ugt i64 %iv, %4
+  br i1 %5, label %inner, label %outer_latch
+
+outer_latch:                                      ; preds = %inner
+  store atomic i32 %2, i32 addrspace(1)* %p unordered, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %6 = icmp ugt i64 %iv, 63
+  br i1 %6, label %exit, label %outer
+
+exit:                                             ; preds = %outer_latch
+  ret i32 0
+}