[llvm] 9abe500 - [SLP] Fix the trunc instruction insertion problem

Wed Mar 17 03:51:57 PDT 2021

Author: Bu Le
Date: 2021-03-17T13:51:08+03:00
New Revision: 9abe5004733034a17e79ef6b7e9b19000c4ea4be

URL: https://github.com/llvm/llvm-project/commit/9abe5004733034a17e79ef6b7e9b19000c4ea4be
DIFF: https://github.com/llvm/llvm-project/commit/9abe5004733034a17e79ef6b7e9b19000c4ea4be.diff

LOG: [SLP] Fix the trunc instruction insertion problem

Current SLP pass has this piece of code that inserts a trunc instruction
after the vectorized instruction. In the case that the vectorized instruction
is a phi node and not the last phi node in the BB, the trunc instruction
will be inserted between two phi nodes, which will trigger verify problem
in debug version or unpredictable error in another pass.
This patch changes the algorithm to 'if the last vectorized instruction
is a phi, insert it after the last phi node in current BB' to fix this problem.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c6edfce5628f..cc11ddd04d51 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4909,8 +4909,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // sign extend the extracted values below.
   auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
   if (MinBWs.count(ScalarRoot)) {
-    if (auto *I = dyn_cast<Instruction>(VectorRoot))
-      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+    if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
+      // If current instr is a phi and not the last phi, insert it after the
+      // last phi node.
+      if (isa<PHINode>(I))
+        Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
+      else
+        Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+    }
     auto BundleWidth = VectorizableTree[0]->Scalars.size();
     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
     auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
index 71f3710a3c24..f6ab38bb3935 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -disable-verify -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 @d = internal unnamed_addr global i32 5, align 4
 
@@ -8,7 +8,7 @@ define dso_local void @l() local_unnamed_addr {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP12:%.*]], [[BB25:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
@@ -28,12 +28,12 @@ define dso_local void @l() local_unnamed_addr {
 ; CHECK:       bb25:
 ; CHECK-NEXT:    [[I28:%.*]] = phi i32 [ [[I12]], [[BB11]] ], [ [[I4]], [[BB3]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP9]], [[BB11]] ], [ [[TMP3]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i8>
-; CHECK-NEXT:    [[TMP12]] = phi <2 x i16> [ [[TMP4]], [[BB11]] ], [ [[TMP1]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i8> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP11]] = phi <2 x i16> [ [[TMP4]], [[BB11]] ], [ [[TMP1]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i8> [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP13]] to i32
 ; CHECK-NEXT:    [[I31:%.*]] = and i32 undef, [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i8> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 ; CHECK-NEXT:    [[I32:%.*]] = and i32 [[I31]], [[TMP16]]
 ; CHECK-NEXT:    [[I33:%.*]] = and i32 [[I32]], [[I28]]