[llvm] r306575 - [LV] Fix PR33613 - retain order of insertelement per part

Wed Jun 28 10:59:33 PDT 2017

Author: ayalz
Date: Wed Jun 28 10:59:33 2017
New Revision: 306575

URL: http://llvm.org/viewvc/llvm-project?rev=306575&view=rev
Log:
[LV] Fix PR33613 - retain order of insertelement per part

r306381 caused PR33613, by reversing the order in which insertelements were
generated per unroll part. This patch fixes PR33613 by retraining this order,
placing each set of insertelements per part immediately after the last scalar
being packed for this part. Includes a test case derived from PR33613.

Reference: https://bugs.llvm.org/show_bug.cgi?id=33613
Differential Revision: https://reviews.llvm.org/D34760

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=306575&r1=306574&r2=306575&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Jun 28 10:59:33 2017
@@ -2716,13 +2716,13 @@ Value *InnerLoopVectorizer::getOrCreateV
       return ScalarValue;
     }
 
-    // Get the last scalar instruction we generated for V. If the value is
-    // known to be uniform after vectorization, this corresponds to lane zero
-    // of the last unroll iteration. Otherwise, the last instruction is the one
-    // we created for the last vector lane of the last unroll iteration.
+    // Get the last scalar instruction we generated for V and Part. If the value
+    // is known to be uniform after vectorization, this corresponds to lane zero
+    // of the Part unroll iteration. Otherwise, the last instruction is the one
+    // we created for the last vector lane of the Part unroll iteration.
     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
     auto *LastInst =
-        cast<Instruction>(getOrCreateScalarValue(V, UF - 1, LastLane));
+        cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
 
     // Set the insert point after the last scalarized instruction. This ensures
     // the insertelement sequence will directly follow the scalar definitions.
@@ -4047,7 +4047,8 @@ void InnerLoopVectorizer::fixFirstOrderR
   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
 
-  // Get the vectorized previous value.
+  // Get the vectorized previous value of the last part UF - 1. It appears last
+  // among all unrolled iterations, due to the order of their construction.
   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
 
   // Set the insertion point after the previous value if it is an instruction.

Modified: llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll?rev=306575&r1=306574&r2=306575&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll Wed Jun 28 10:59:33 2017
@@ -295,14 +295,14 @@ for.cond.cleanup3:
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load i32, i32* {{.*}}
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = load i32, i32* {{.*}}
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i32, i32* {{.*}}
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i32, i32* {{.*}}
-; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i32, i32* {{.*}}
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i32, i32* {{.*}}
-; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load i32, i32* {{.*}}
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load i32, i32* {{.*}}
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2
@@ -396,3 +396,54 @@ for.body:
 for.end:
   ret i32 %val.phi
 }
+
+; We vectorize this first order recurrence, with a set of insertelements for
+; each unrolled part. Make sure these insertelements are generated in-order,
+; because the shuffle of the first order recurrence will be added after the
+; insertelement of the last part UF - 1, assuming the latter appears after the
+; insertelements of all other parts.
+;
+; int PR33613(double *b, double j, int d) {
+;   int a = 0;
+;   for(int i = 0; i < 10240; i++, b+=25) {
+;     double f = b[d]; // Scalarize to form insertelements
+;     if (j * f)
+;       a++;
+;     j = f;
+;   }
+;   return a;
+; }
+;
+; UNROLL-NO-IC-LABEL: @PR33613(
+; UNROLL-NO-IC:     vector.body:
+; UNROLL-NO-IC:       [[VECTOR_RECUR:%.*]] = phi <4 x double>
+; UNROLL-NO-IC:       shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:  shufflevector <4 x double> {{.*}}, <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NOT:   insertelement <4 x double>
+; UNROLL-NO-IC:     middle.block:
+;
+define i32 @PR33613(double* %b, double %j, i32 %d) {
+entry:
+  %idxprom = sext i32 %d to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %a.1.lcssa = phi i32 [ %a.1, %for.body ]
+  ret i32 %a.1.lcssa
+
+for.body:
+  %b.addr.012 = phi double* [ %b, %entry ], [ %add.ptr, %for.body ]
+  %i.011 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+  %a.010 = phi i32 [ 0, %entry ], [ %a.1, %for.body ]
+  %j.addr.09 = phi double [ %j, %entry ], [ %0, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b.addr.012, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %mul = fmul double %j.addr.09, %0
+  %tobool = fcmp une double %mul, 0.000000e+00
+  %inc = zext i1 %tobool to i32
+  %a.1 = add nsw i32 %a.010, %inc
+  %inc1 = add nuw nsw i32 %i.011, 1
+  %add.ptr = getelementptr inbounds double, double* %b.addr.012, i64 25
+  %exitcond = icmp eq i32 %inc1, 10240
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}