[llvm] r296606 - [SLP] Fix for PR32038: extra add of PHI node when it is not required.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 1 02:50:45 PST 2017


Author: abataev
Date: Wed Mar  1 04:50:44 2017
New Revision: 296606

URL: http://llvm.org/viewvc/llvm-project?rev=296606&view=rev
Log:
[SLP] Fix for PR32038: extra add of PHI node when it is not required.

Summary:
If horizontal reduction tree starts from the binary operation that is
used in PHI node, but this PHI is not used in horizontal reduction, we
may end up with extra addition of this PHI node after vectorization.
Here is an example:
```
%phi = phi i32 [ %tmp, %end], ...
...
%tmp = add i32 %tmp1, %tmp2
end:
```
after vectorization we always have something like:

```
%phi = phi i32 [ %tmp, %end], ...
...
%red = extractelement <8 x 32> %vec.red, 0
%tmp = add i32 %red, %phi
end:
```
even if `%phi` is not used in reduction tree. Patch considers these PHI
nodes as extra arguments and considers them in the final result iff they
really used in reduction.

Reviewers: mkuper, hfinkel, mzolotukhin

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30409

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=296606&r1=296605&r2=296606&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Wed Mar  1 04:50:44 2017
@@ -4250,14 +4250,6 @@ class HorizontalReduction {
   MapVector<Instruction *, Value *> ExtraArgs;
 
   BinaryOperator *ReductionRoot = nullptr;
-  // After successfull horizontal reduction vectorization attempt for PHI node
-  // vectorizer tries to update root binary op by combining vectorized tree and
-  // the ReductionPHI node. But during vectorization this ReductionPHI can be
-  // vectorized itself and replaced by the undef value, while the instruction
-  // itself is marked for deletion. This 'marked for deletion' PHI node then can
-  // be used in new binary operation, causing "Use still stuck around after Def
-  // is destroyed" crash upon PHI node deletion.
-  WeakVH ReductionPHI;
 
   /// The opcode of the reduction.
   Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
@@ -4318,7 +4310,6 @@ public:
     ReductionOpcode = B->getOpcode();
     ReducedValueOpcode = 0;
     ReductionRoot = B;
-    ReductionPHI = Phi;
 
     // We currently only support adds.
     if ((ReductionOpcode != Instruction::Add &&
@@ -4406,9 +4397,9 @@ public:
           Stack.push_back(std::make_pair(I, 0));
           continue;
         }
-        // NextV is an extra argument for TreeN (its parent operation).
-        markExtraArg(Stack.back(), NextV);
       }
+      // NextV is an extra argument for TreeN (its parent operation).
+      markExtraArg(Stack.back(), NextV);
     }
     return true;
   }
@@ -4497,12 +4488,7 @@ public:
         }
       }
       // Update users.
-      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
-        assert(ReductionRoot && "Need a reduction operation");
-        ReductionRoot->setOperand(0, VectorizedTree);
-        ReductionRoot->setOperand(1, ReductionPHI);
-      } else
-        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+      ReductionRoot->replaceAllUsesWith(VectorizedTree);
     }
     return VectorizedTree != nullptr;
   }

Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll?rev=296606&r1=296605&r2=296606&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll Wed Mar  1 04:50:44 2017
@@ -9,7 +9,7 @@ target triple = "aarch64--linux-gnu"
 @a = common global [80 x i8] zeroinitializer, align 16
 
 ; DEFAULT-LABEL: @PR28330(
-; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
 ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
@@ -18,10 +18,10 @@ target triple = "aarch64--linux-gnu"
 ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
 ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17
+; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17
 ;
 ; GATHER-LABEL: @PR28330(
-; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
 ; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80
 ; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80
 ; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80
@@ -45,7 +45,7 @@ target triple = "aarch64--linux-gnu"
 ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
 ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; GATHER: %tmp34 = add i32 %[[R6]], %tmp17
+; GATHER: %bin.extra = add i32 %[[R6]], %tmp17
 ;
 ; MAX-COST-LABEL: @PR28330(
 ; MAX-COST-NOT: shufflevector
@@ -98,7 +98,7 @@ define void @PR32038(i32 %n) {
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
 ; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; DEFAULT-NEXT:    [[TMP20:%.*]] = add i32 -5, undef
 ; DEFAULT-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], undef
@@ -114,8 +114,8 @@ define void @PR32038(i32 %n) {
 ; DEFAULT-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; DEFAULT-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; DEFAULT-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP3]], -5
-; DEFAULT-NEXT:    [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]]
+; DEFAULT-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], undef
 ; DEFAULT-NEXT:    br label [[FOR_BODY]]
 ;
 ; GATHER-LABEL: @PR32038(
@@ -138,7 +138,7 @@ define void @PR32038(i32 %n) {
 ; GATHER-NEXT:    [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
 ; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; GATHER:       for.body:
-; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; GATHER-NEXT:    [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80
 ; GATHER-NEXT:    [[TMP20:%.*]] = add i32 -5, [[TMP19]]
 ; GATHER-NEXT:    [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80
@@ -169,8 +169,8 @@ define void @PR32038(i32 %n) {
 ; GATHER-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; GATHER-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; GATHER-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP8]], -5
-; GATHER-NEXT:    [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]]
+; GATHER-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP8]], -5
+; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR32038(

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll?rev=296606&r1=296605&r2=296606&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll Wed Mar  1 04:50:44 2017
@@ -14,7 +14,7 @@ define i32 @test(i32* nocapture readonly
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* %p, i64 7
 ; CHECK-NEXT:    br label %for.body
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %bin.extra, %for.body ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* %p to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
@@ -32,10 +32,10 @@ define i32 @test(i32* nocapture readonly
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]]
-; CHECK-NEXT:    br i1 true, label %for.end, label %for.body
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP4]], [[SUM]]
+; CHECK:         br i1 true, label %for.end, label %for.body
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[ADD_7]]
+; CHECK-NEXT:    ret i32 [[BIN_EXTRA]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll?rev=296606&r1=296605&r2=296606&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll Wed Mar  1 04:50:44 2017
@@ -12,7 +12,7 @@ define i32 @foo(i32* nocapture readonly
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK:         [[ADD52:%.*]] = add i32 [[TMP15]],
 ; CHECK:          ret i32 [[ADD52]]
 ;
 entry:




More information about the llvm-commits mailing list