[PATCH] D19501: Add LoadStoreVectorizer pass

Thu Jun 23 14:16:28 PDT 2016

asbirlea added a comment.

Here's a test-case that currently produces incorrect results on my end.

  target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64--linux-gnueabihf"

  define i32 @test(i32* noalias %ptr) {
  entry:
    br label %"for something"

  "for something":
    %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
    %next.gep = getelementptr i32, i32* %ptr, i64 %index
    %a1 = add nsw i64 %index, 1
    %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1
    %a2 = add nsw i64 %index, 2
    %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2

    %l1 = load i32, i32* %next.gep1, align 4                                                         
    %l2 = load i32, i32* %next.gep, align 4                                                          
    store i32 0, i32* %next.gep1, align 4                                                            
    store i32 0, i32* %next.gep, align 4
    %l3 = load i32, i32* %next.gep1, align 4                                                         
    %l4 = load i32, i32* %next.gep2, align 4   
    %index.next = add i64 %index, 8
    %cmp_res = icmp eq i64 %index.next, 8
    br i1 %cmp_res, label %ending, label %"for something"

  ending:
    ret i32 0

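The pass does not treat the stores as aliasing with the loads: it emits the vectorized store first, ahead of all the loads, so the loads of %l1 and %l2 now execute after the store to the same addresses instead of before it.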
Output I see:

  %0 = bitcast i32* %next.gep to <2 x i32>*                                                        
  store <2 x i32> zeroinitializer, <2 x i32>* %0, align 4                                          
  %1 = bitcast i32* %next.gep to <2 x i32>*                                                        
  %2 = load <2 x i32>, <2 x i32>* %1, align 4                                                      
  %3 = extractelement <2 x i32> %2, i32 0
  %4 = extractelement <2 x i32> %2, i32 1                                                          
  %5 = bitcast i32* %next.gep1 to <2 x i32>*                                                       
  %6 = load <2 x i32>, <2 x i32>* %5, align 4                                                      
  %7 = extractelement <2 x i32> %6, i32 0
  %8 = extractelement <2 x i32> %6, i32 1   

Is there a follow-up patch fixing this that I missed?

http://reviews.llvm.org/D19501