[PATCH] D20315: [LV] For some induction variables, use vector phis instead of widening the scalar in the loop body

Tue May 17 10:51:39 PDT 2016

davidxl added a comment.

A test case that explicitly exercises this should probably be added.

I tried the patch with the following program:

__attribute__((noinline)) long long hot() {

  long long x = 0;

#pragma clang loop vectorize_width(4) interleave_count(1)
#pragma nounroll

   for (int i = 0; i < 1000; i++) {
            x += i^2;
   }

  return x;

}

It improves performance by about 10%. However, the generated code is still not optimal -- there is unnecessary vector IV copy code that can be moved out of the loop or removed. Perhaps a follow-up patch to address that?

LBB0_1:                                # =>This Inner Loop Header: Depth=1

  movdqa  %xmm4, %xmm5                                <---- here
  paddd   %xmm1, %xmm5
  pxor    %xmm2, %xmm4
  pshufd  $78, %xmm4, %xmm6       # xmm6 = xmm4[2,3,0,1]
  movdqa  %xmm6, %xmm7
  psrad   $31, %xmm7
  punpckldq       %xmm7, %xmm6    # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
  movdqa  %xmm4, %xmm7
  psrad   $31, %xmm7
  punpckldq       %xmm7, %xmm4    # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
  paddq   %xmm4, %xmm0
  paddq   %xmm6, %xmm3
  addl    $-4, %eax
  movdqa  %xmm5, %xmm4                      <----- here
  jne     .LBB0_1

http://reviews.llvm.org/D20315