[llvm] r287781 - [LoadStoreVectorizer] Enable vectorization of stores in the presence of an aliasing load

Alina Sbirlea via llvm-commits <llvm-commits at lists.llvm.org>
Wed Nov 23 09:43:15 PST 2016


Author: asbirlea
Date: Wed Nov 23 11:43:15 2016
New Revision: 287781

URL: http://llvm.org/viewvc/llvm-project?rev=287781&view=rev
Log:
[LoadStoreVectorizer] Enable vectorization of stores in the presence of an aliasing load

Summary:
The "getVectorizablePrefix" method would give up if it found an aliasing load for a store chain.
In practice, the aliasing load can be treated as a memory barrier and all stores that precede it
are a valid vectorizable prefix.
Issue found by volkan in D26962. Testcase is a pruned version of the one in the original patch.
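
For illustration, a minimal IR sketch of the store-chain case (hypothetical pointers %p and %q
that may alias; not taken from the patch): the two stores before the load form a vectorizable
prefix, while the store past the barrier does not.

  store float %v, float addrspace(1)* %p, align 4
  %p1 = getelementptr float, float addrspace(1)* %p, i64 1
  store float %v, float addrspace(1)* %p1, align 4
  ; aliasing load: acts as a memory barrier that ends the prefix
  %x = load float, float addrspace(1)* %q, align 4
  %p2 = getelementptr float, float addrspace(1)* %p, i64 2
  ; follows the barrier, so it is not part of the vectorizable prefix
  store float %v, float addrspace(1)* %p2, align 4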

Reviewers: jlebar, arsenm, tstellarAMD

Subscribers: mzolotukhin, wdng, nhaehnle, anna, volkan, llvm-commits

Differential Revision: https://reviews.llvm.org/D27008

Added:
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp?rev=287781&r1=287780&r2=287781&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp Wed Nov 23 11:43:15 2016
@@ -477,10 +477,23 @@ Vectorizer::getVectorizablePrefix(ArrayR
 
   // Loop until we find an instruction in ChainInstrs that we can't vectorize.
   unsigned ChainInstrIdx = 0;
+  Instruction *BarrierMemoryInstr = nullptr;
+
   for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
     Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-    bool AliasFound = false;
+
+    // If a barrier memory instruction was found, chain instructions that follow
+    // will not be added to the valid prefix.
+    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+      break;
+
+    // Check (in BB order) if any instruction prevents ChainInstr from being
+    // vectorized. Find and store the first such "conflicting" instruction.
     for (Instruction *MemInstr : MemoryInstrs) {
+      // If a barrier memory instruction was found, do not check past it.
+      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+        break;
+
       if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
         continue;
 
@@ -508,12 +521,21 @@ Vectorizer::getVectorizablePrefix(ArrayR
                  << "  " << *ChainInstr << '\n'
                  << "  " << *getPointerOperand(ChainInstr) << '\n';
         });
-        AliasFound = true;
+        // Save this aliasing memory instruction as a barrier, but allow other
+        // instructions that precede the barrier to be vectorized with this one.
+        BarrierMemoryInstr = MemInstr;
         break;
       }
     }
-    if (AliasFound)
+    // Continue the search only for store chains, since vectorizing stores that
+    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+    // up to an aliasing store, but should not pull loads from further down in
+    // the basic block.
+    if (IsLoadChain && BarrierMemoryInstr) {
+      // The BarrierMemoryInstr is a store that precedes ChainInstr.
+      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
       break;
+    }
   }
 
   // Find the largest prefix of Chain whose elements are all in

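The load-chain direction described by the new comment can be sketched similarly (again
hypothetical IR, with %p and %q possibly aliasing): the first load is a valid prefix on its
own, but vectorizing it together with the second load would hoist that load above the
intervening store.

  %x0 = load float, float addrspace(1)* %p, align 4
  ; aliasing store: barrier for the load chain
  store float %v, float addrspace(1)* %q, align 4
  %p1 = getelementptr float, float addrspace(1)* %p, i64 1
  ; must not be pulled above the store by vectorization
  %x1 = load float, float addrspace(1)* %p1, align 4
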
Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll?rev=287781&r1=287780&r2=287781&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll Wed Nov 23 11:43:15 2016
@@ -61,13 +61,11 @@ entry:
 }
 
 ; Here we have four stores, with an aliasing load before the last one.  We can
-; vectorize the first two stores as <2 x float>, but this vectorized store must
-; be inserted at the location of the second scalar store, not the fourth one.
+; vectorize the first three stores as <3 x float>, but this vectorized store must
+; be inserted at the location of the third scalar store, not the fourth one.
 ;
 ; CHECK-LABEL: @insert_store_point_alias
-; CHECK: store <2 x float>
-; CHECK: store float
-; CHECK-SAME: %a.idx.2
+; CHECK: store <3 x float>
 ; CHECK: load float, float addrspace(1)* %a.idx.2
 ; CHECK: store float
 ; CHECK-SAME: %a.idx.3

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll?rev=287781&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll Wed Nov 23 11:43:15 2016
@@ -0,0 +1,58 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+; Check that, in the presence of an aliasing load, the stores preceding the
+; aliasing load are safe to vectorize.
+
+; CHECK-LABEL: store_vectorize_with_alias
+; CHECK: store <4 x float>
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+
+; Function Attrs: nounwind
+define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
+  %tmp1 = load float, float addrspace(1)* %tmp, align 4
+
+  %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp2, align 4
+  %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4
+  %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp4, align 4
+  %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8
+  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp6, align 4
+  %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12
+  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp8, align 4
+
+  %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16
+  %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)*
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4
+  %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20
+  %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)*
+  %tmp14 = load float, float addrspace(1)* %tmp13, align 4
+  %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24
+  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
+  %tmp17 = load float, float addrspace(1)* %tmp16, align 4
+  %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28
+  %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)*
+  %tmp20 = load float, float addrspace(1)* %tmp19, align 4
+
+  %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16
+  %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)*
+  store float %tmp11, float addrspace(1)* %tmp22, align 4
+  %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20
+  %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)*
+  store float %tmp14, float addrspace(1)* %tmp24, align 4
+  %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24
+  %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)*
+  store float %tmp17, float addrspace(1)* %tmp26, align 4
+  %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28
+  %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)*
+  store float %tmp20, float addrspace(1)* %tmp28, align 4
+
+  ret void
+}
+
+attributes #0 = { argmemonly nounwind }
