[llvm] c03696d - [SLP]Improve and fix getVectorElementSize.

Thu Apr 1 07:04:09 PDT 2021

Author: Alexey Bataev
Date: 2021-04-01T06:51:26-07:00
New Revision: c03696da5e13d355775dd4382957b3dcd3c3ad7c

URL: https://github.com/llvm/llvm-project/commit/c03696da5e13d355775dd4382957b3dcd3c3ad7c
DIFF: https://github.com/llvm/llvm-project/commit/c03696da5e13d355775dd4382957b3dcd3c3ad7c.diff

LOG: [SLP]Improve and fix getVectorElementSize.

1. The InstrElementSize map must be cleared for each new tree; otherwise
stale sizes from a previous vectorization attempt might be used.
2. Instructions from other basic blocks are no longer included in the
   analysis (except as operands of PHI nodes), which saves compile time.

Differential Revision: https://reviews.llvm.org/D99677

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
    llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9fc31586e7052..3294318e7089d 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -627,6 +627,7 @@ class BoUpSLP {
       BS->clear();
     }
     MinBWs.clear();
+    InstrElementSize.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -5635,53 +5636,58 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
   // that feed it. The type of the loaded value may indicate a more suitable
   // width than V's type. We want to base the vector element size on the width
   // of memory operations where possible.
-  SmallVector<Instruction *, 16> Worklist;
+  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
   SmallPtrSet<Instruction *, 16> Visited;
   if (auto *I = dyn_cast<Instruction>(V)) {
-    Worklist.push_back(I);
+    Worklist.emplace_back(I, I->getParent());
     Visited.insert(I);
   }
 
   // Traverse the expression tree in bottom-up order looking for loads. If we
   // encounter an instruction we don't yet handle, we give up.
-  auto MaxWidth = 0u;
-  auto FoundUnknownInst = false;
-  while (!Worklist.empty() && !FoundUnknownInst) {
-    auto *I = Worklist.pop_back_val();
+  auto Width = 0u;
+  while (!Worklist.empty()) {
+    Instruction *I;
+    BasicBlock *Parent;
+    std::tie(I, Parent) = Worklist.pop_back_val();
 
     // We should only be looking at scalar instructions here. If the current
-    // instruction has a vector type, give up.
+    // instruction has a vector type, skip.
     auto *Ty = I->getType();
     if (isa<VectorType>(Ty))
-      FoundUnknownInst = true;
+      continue;
 
     // If the current instruction is a load, update MaxWidth to reflect the
     // width of the loaded value.
-    else if (isa<LoadInst>(I))
-      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+    if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) ||
+        isa<ExtractValueInst>(I))
+      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
 
     // Otherwise, we need to visit the operands of the instruction. We only
     // handle the interesting cases from buildTree here. If an operand is an
-    // instruction we haven't yet visited, we add it to the worklist.
+    // instruction we haven't yet visited and from the same basic block as the
+    // user or the use is a PHI node, we add it to the worklist.
     else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
-             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) ||
+             isa<UnaryOperator>(I)) {
       for (Use &U : I->operands())
         if (auto *J = dyn_cast<Instruction>(U.get()))
-          if (Visited.insert(J).second)
-            Worklist.push_back(J);
+          if (Visited.insert(J).second &&
+              (isa<PHINode>(I) || J->getParent() == Parent))
+            Worklist.emplace_back(J, J->getParent());
+    } else {
+      break;
     }
-
-    // If we don't yet handle the instruction, give up.
-    else
-      FoundUnknownInst = true;
   }
 
-  int Width = MaxWidth;
   // If we didn't encounter a memory access in the expression tree, or if we
   // gave up for some reason, just return the width of V. Otherwise, return the
   // maximum width we found.
-  if (!MaxWidth || FoundUnknownInst)
+  if (!Width) {
+    if (auto *CI = dyn_cast<CmpInst>(V))
+      V = CI->getOperand(0);
     Width = DL->getTypeSizeInBits(V->getType());
+  }
 
   for (Instruction *I : Visited)
     InstrElementSize[I] = Width;

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 0b6aeb6435148..16e7549a6a2a5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -61,25 +61,23 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[A0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
-; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
index 54724df7ee786..bde72f647604e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
@@ -5,25 +5,17 @@ define void @inst_size(i64* %a, <2 x i64> %b) {
 ; CHECK-LABEL: @inst_size(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[TMPL1:%.*]] = load i64, i64* [[A:%.*]], align 4
-; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
-; CHECK-NEXT:    [[TMPL2:%.*]] = load i64, i64* [[PTR2]], align 4
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
-; CHECK-NEXT:    [[TMPL3:%.*]] = load i64, i64* [[PTR3]], align 4
 ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3
-; CHECK-NEXT:    [[TMPL4:%.*]] = load i64, i64* [[PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[T41:%.*]] = icmp sgt i64 0, [[VAL]]
-; CHECK-NEXT:    [[T42:%.*]] = icmp sgt i64 0, [[TMPL1]]
-; CHECK-NEXT:    [[T43:%.*]] = icmp sgt i64 0, [[TMPL2]]
-; CHECK-NEXT:    [[T44:%.*]] = icmp sgt i64 0, [[TMPL3]]
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    br label [[BLOCK:%.*]]
 ; CHECK:       block:
 ; CHECK-NEXT:    [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi i1 [ [[T42]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PHI3:%.*]] = phi i1 [ [[T43]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PHI4:%.*]] = phi i1 [ [[T44]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret void
 ;
 entry: