[llvm] r280179 - [LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.

Tue Aug 30 16:54:00 PDT 2016

Author: asbirlea
Date: Tue Aug 30 18:53:59 2016
New Revision: 280179

URL: http://llvm.org/viewvc/llvm-project?rev=280179&view=rev
Log:
[LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.

Summary:
LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize.
A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions
in heads(H) and tails(T) match, which is not the case if there are multiple tails for the same head.

e.g.:
i1: store a[0]
i2: store a[1]
i3: store a[1]
Leads to:
H: i1
T: i2 i3
Instead of:
H: i1 i1
T: i2 i3
So the positions for instructions that follow i3 will have different indexes in H/T.
This patch resolves PR29148.

This issue also surfaced the fact that if the chain is too long, and TLI
returns a "not-fast" answer, the whole chain will be abandoned for
vectorization, even though a smaller one would be beneficial.
Added a testcase and FIXME for this.

Reviewers: tstellarAMD, arsenm, jlebar

Subscribers: mzolotukhin, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D24057

Added:
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
    llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp?rev=280179&r1=280178&r2=280179&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp Tue Aug 30 18:53:59 2016
@@ -628,7 +628,7 @@ bool Vectorizer::vectorizeChains(InstrLi
 
 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
   DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
-  SmallSetVector<int, 16> Heads, Tails;
+  SmallVector<int, 16> Heads, Tails;
   int ConsecutiveChain[64];
 
   // Do a quadratic search on all of the given stores and find all of the pairs
@@ -647,8 +647,8 @@ bool Vectorizer::vectorizeInstructions(A
             continue; // Should not insert.
         }
 
-        Tails.insert(j);
-        Heads.insert(i);
+        Tails.push_back(j);
+        Heads.push_back(i);
         ConsecutiveChain[i] = j;
       }
     }
@@ -660,21 +660,21 @@ bool Vectorizer::vectorizeInstructions(A
   for (int Head : Heads) {
     if (InstructionsProcessed.count(Instrs[Head]))
       continue;
-    bool longerChainExists = false;
+    bool LongerChainExists = false;
     for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
       if (Head == Tails[TIt] &&
           !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
-        longerChainExists = true;
+        LongerChainExists = true;
         break;
       }
-    if (longerChainExists)
+    if (LongerChainExists)
       continue;
 
     // We found an instr that starts a chain. Now follow the chain and try to
     // vectorize it.
     SmallVector<Instruction *, 16> Operands;
     int I = Head;
-    while (I != -1 && (Tails.count(I) || Heads.count(I))) {
+    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
       if (InstructionsProcessed.count(Instrs[I]))
         break;
 

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll?rev=280179&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll Tue Aug 30 18:53:59 2016
@@ -0,0 +1,64 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+; Checks that there is no crash when there are multiple tails
+; for the same head starting a chain.
+@0 = internal addrspace(3) global [16384 x i32] undef
+
+; CHECK-LABEL: @no_crash(
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: store i32 0
+; CHECK: store i32 0
+
+define void @no_crash(i32 %arg) {
+  %tmp2 = add i32 %arg, 14
+  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
+  %tmp4 = add i32 %arg, 15
+  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4
+
+  store i32 0, i32 addrspace(3)* %tmp3, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+
+  ret void
+}
+
+; Check adjacent memory locations are properly matched and the
+; longest chain vectorized
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32 %arg) {
+  %a1 = add i32 %arg, 1
+  %a2 = add i32 %arg, 2
+  %a3 = add i32 %arg, 3
+  %a4 = add i32 %arg, 4
+  %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
+  %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
+  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
+  %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
+  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4
+
+  %l1 = load i32, i32 addrspace(3)* %tmp2, align 4
+  %l2 = load i32, i32 addrspace(3)* %tmp1, align 4
+  store i32 0, i32 addrspace(3)* %tmp2, align 4
+  store i32 0, i32 addrspace(3)* %tmp1, align 4
+  %l3 = load i32, i32 addrspace(3)* %tmp2, align 4
+  %l4 = load i32, i32 addrspace(3)* %tmp3, align 4
+  %l5 = load i32, i32 addrspace(3)* %tmp4, align 4
+  %l6 = load i32, i32 addrspace(3)* %tmp5, align 4
+  %l7 = load i32, i32 addrspace(3)* %tmp5, align 4
+  %l8 = load i32, i32 addrspace(3)* %tmp5, align 4
+
+  ret void
+}
+

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll?rev=280179&r1=280178&r2=280179&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll Tue Aug 30 18:53:59 2016
@@ -85,3 +85,33 @@ define void  @chain_prefix_suffix(i32* n
   ret void
 }
 
+; FIXME: If the chain is too long and TLI says misaligned is not fast,
+; then LSV fails to vectorize anything in that chain.
+; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <3 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32* noalias %ptr) {
+  %tmp1 = getelementptr i32, i32* %ptr, i64 0
+  %tmp2 = getelementptr i32, i32* %ptr, i64 1
+  %tmp3 = getelementptr i32, i32* %ptr, i64 2
+  %tmp4 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %tmp2, align 4
+  %l2 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp2, align 4
+  store i32 0, i32* %tmp1, align 4
+  %l3 = load i32, i32* %tmp2, align 4
+  %l4 = load i32, i32* %tmp3, align 4
+  %l5 = load i32, i32* %tmp4, align 4
+  %l6 = load i32, i32* %tmp4, align 4
+  %l7 = load i32, i32* %tmp4, align 4
+
+  ret void
+}