[llvm] r285286 - [SLP] Fix for PR30626: Compiler crash inside SLP Vectorizer.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 27 05:02:28 PDT 2016


Author: abataev
Date: Thu Oct 27 07:02:28 2016
New Revision: 285286

URL: http://llvm.org/viewvc/llvm-project?rev=285286&view=rev
Log:
[SLP] Fix for PR30626: Compiler crash inside SLP Vectorizer.

After a successful horizontal reduction vectorization attempt for a PHI
node, the vectorizer tries to update the root binary op by combining the
vectorized tree and the ReductionPHI node. But during vectorization this
ReductionPHI can itself be vectorized and replaced by an `undef` value,
while the instruction itself is marked for deletion. This 'marked for
deletion' PHI node can then be used in the new binary operation, causing
a "Use still stuck around after Def is destroyed" crash upon PHI node
deletion.

The test is also fixed so that it performs actual testing.

Differential Revision: https://reviews.llvm.org/D25671
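
For reference, a minimal sketch of why switching ReductionPHI from a raw
PHINode* to a WeakVH addresses the crash, assuming the ValueHandle
semantics from llvm/IR/ValueHandle.h (a WeakVH follows
replaceAllUsesWith and nulls itself when the tracked value is deleted).
The function and variable names below (updateUsers, Phi, Root,
VectorizedTree) are illustrative stand-ins, not the vectorizer's actual
code:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/ValueHandle.h"
  using namespace llvm;

  // Sketch of the guarded user update from the patch. If the reduction
  // PHI was itself vectorized, it has been RAUW'd to undef and the WeakVH
  // now holds that undef, so the guard skips the operand rewrite.
  void updateUsers(PHINode *Phi, BinaryOperator *Root, Value *VectorizedTree) {
    WeakVH ReductionPHI = Phi; // tracks RAUW and deletion of the PHI
    // ... vectorization may run here and do
    //     Phi->replaceAllUsesWith(UndefValue::get(Phi->getType()));
    //     while queueing Phi itself for deletion.
    if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
      Root->setOperand(0, VectorizedTree);
      Root->setOperand(1, ReductionPHI); // only when the PHI is still live
    }
  }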

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=285286&r1=285285&r2=285286&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Oct 27 07:02:28 2016
@@ -4062,7 +4062,14 @@ class HorizontalReduction {
   SmallVector<Value *, 32> ReducedVals;
 
   BinaryOperator *ReductionRoot;
-  PHINode *ReductionPHI;
+  // After successful horizontal reduction vectorization attempt for PHI node
+  // vectorizer tries to update root binary op by combining vectorized tree and
+  // the ReductionPHI node. But during vectorization this ReductionPHI can be
+  // vectorized itself and replaced by the undef value, while the instruction
+  // itself is marked for deletion. This 'marked for deletion' PHI node then can
+  // be used in new binary operation, causing "Use still stuck around after Def
+  // is destroyed" crash upon PHI node deletion.
+  WeakVH ReductionPHI;
 
   /// The opcode of the reduction.
   unsigned ReductionOpcode;
@@ -4081,8 +4088,8 @@ public:
   unsigned MinVecRegSize;
 
   HorizontalReduction(unsigned MinVecRegSize)
-      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
-        ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+        IsPairwiseReduction(false), ReduxWidth(0),
         MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
@@ -4247,7 +4254,7 @@ public:
                                      ReducedVals[i]);
       }
       // Update users.
-      if (ReductionPHI) {
+      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
         assert(ReductionRoot && "Need a reduction operation");
         ReductionRoot->setOperand(0, VectorizedTree);
         ReductionRoot->setOperand(1, ReductionPHI);
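
As a rough illustration of the failure mode this guard prevents (a
hypothetical sketch, not the vectorizer's code): rewiring the root
operation to use a PHI that is already queued for deletion leaves a live
use of it, and erasing the PHI afterwards trips the "Use still stuck
around after Def is destroyed" assertion, since a Value must have no
remaining uses when it is destroyed.

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical stand-ins: Root is the reduction's root binary operator,
  // DeadPhi a reduction PHI that was itself vectorized and marked for
  // deletion.
  void crashSketch(BinaryOperator *Root, PHINode *DeadPhi) {
    Root->setOperand(1, DeadPhi); // adds a use of the doomed PHI
    // ... later, when instructions marked for deletion are erased:
    DeadPhi->eraseFromParent();   // asserts: Root still uses DeadPhi
  }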

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll?rev=285286&r1=285285&r2=285286&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll Thu Oct 27 07:02:28 2016
@@ -1,6 +1,5 @@
-; RUN: opt -slp-vectorizer -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
 
 ; #include <stdint.h>
 ;
@@ -15,9 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8
 ;   return sum;
 ; }
 
-; NOSTORE-LABEL: add_red
-; NOSTORE: fmul <4 x float>
-; NOSTORE: shufflevector <4 x float>
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
 
 define i32 @add_red(float* %A, i32 %n) {
 entry:
@@ -148,8 +147,8 @@ for.end:
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; CHECK: fmul fast <8 x float>
+; CHECK: shufflevector <8 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 entry:
@@ -305,6 +304,149 @@ for.end:
   ret i32 %sum.0.lcssa
 }
 
+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+;   for (uint32_t i = 0; i < 6; ++i) {
+;     const float *ptr = arg_A + i;
+;     float w0 = array[i * 4 + 0];
+;     float w1 = array[i * 4 + 1];
+;     float w2 = array[i * 4 + 2];
+;     float w3 = array[i * 4 + 3];
+;
+;     for (unsigned j = 0; j < arg_B; ++j) {
+;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+;       const float x4 = x3 + (-4.0f * w2) + w3;
+;       w1 = w0;
+;       w0 = x1;
+;       w3 = w2;
+;       w2 = x3;
+;     }
+;
+;     array[i * 4 + 0] = w0;
+;     array[i * 4 + 1] = w1;
+;     array[i * 4 + 2] = w2;
+;     array[i * 4 + 3] = w3;
+;   }
+; }
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK: fmul fast <4 x float>
+; CHECK: shufflevector <4 x float>
+;
+entry:
+  %cmp1495 = icmp eq i32 %arg_B, 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup15, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+  %0 = shl i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+  %1 = load float, float* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+  %3 = load float, float* %arrayidx4, align 4
+  %4 = or i64 %0, 2
+  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+  %5 = load float, float* %arrayidx8, align 4
+  %6 = or i64 %0, 3
+  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+  %7 = load float, float* %arrayidx12, align 4
+  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+for.body16.lr.ph:                                 ; preds = %for.body
+  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+  %8 = load float, float* %add.ptr, align 4
+  br label %for.body16
+
+for.cond.cleanup15:                               ; preds = %for.body16, %for.body
+  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+  store float %w0.0.lcssa, float* %arrayidx, align 4
+  store float %w1.0.lcssa, float* %arrayidx4, align 4
+  store float %w2.0.lcssa, float* %arrayidx8, align 4
+  store float %w3.0.lcssa, float* %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+  br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+  %sub92 = fadd fast float %mul17, %mul18.neg
+  %sub19 = fadd fast float %sub92, %8
+  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+  %mul23 = fmul fast float %w1.099, 0x4002666660000000
+  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+  %add2293 = fadd fast float %mul27.neg, %mul25
+  %add24 = fadd fast float %add2293, %mul23
+  %sub2694 = fadd fast float %add24, %mul21.neg
+  %sub28 = fadd fast float %sub2694, %mul20
+  %inc = add nuw i32 %j.098, 1
+  %exitcond = icmp eq i32 %inc, %arg_B
+  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul fast <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double, double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+  %1 = load double, double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+  %3 = load double, double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+  %4 = load double, double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
 ;   float sum = 0;
 ;   for (intptr_t i=0; i < n; ++i) {
@@ -316,9 +458,9 @@ for.end:
 ;   return sum;
 ; }
 
-; CHECK-LABEL: store_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; STORE-LABEL: store_red
+; STORE: fmul fast <4 x float>
+; STORE: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
 entry:
@@ -368,50 +510,3 @@ for.end:
   ret i32 0
 }
 
-
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
-
-; void foo(double * restrict A, double * restrict B, double * restrict C,
-;          int n) {
-;   for (intptr_t i=0; i < n; ++i) {
-;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
-;   }
-; }
-
-; STORE-LABEL: store_red_double
-; STORE: fmul fast <2 x double>
-; STORE: extractelement <2 x double>
-; STORE: extractelement <2 x double>
-
-define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
-entry:
-  %cmp17 = icmp sgt i32 %n, 0
-  br i1 %cmp17, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
-  %0 = load double, double* %B, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
-  %1 = load double, double* %arrayidx4, align 8
-  %2 = sext i32 %n to i64
-  br label %for.body
-
-for.body:
-  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = shl nsw i64 %i.018, 2
-  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
-  %3 = load double, double* %arrayidx2, align 8
-  %mul3 = fmul fast double %0, %3
-  %add16 = or i64 %mul, 1
-  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
-  %4 = load double, double* %arrayidx6, align 8
-  %mul7 = fmul fast double %1, %4
-  %add8 = fadd fast double %mul3, %mul7
-  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
-  store double %add8, double* %arrayidx9, align 8
-  %inc = add nsw i64 %i.018, 1
-  %exitcond = icmp eq i64 %inc, %2
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
