[PATCH] D20315: [LV] For some induction variables, use vector phis instead of widening the scalar in the loop body

Wei Mi via llvm-commits llvm-commits at lists.llvm.org
Fri May 20 10:57:17 PDT 2016


> However, the generated code is still not optimal -- there is unnecessary vector IV copy code that could be hoisted out of the loop or removed. Perhaps a follow-up patch could address that?
>
> LBB0_1:                                # =>This Inner Loop Header: Depth=1
>
>   movdqa  %xmm4, %xmm5                                <---- here
>   paddd   %xmm1, %xmm5
>   pxor    %xmm2, %xmm4
>   pshufd  $78, %xmm4, %xmm6       # xmm6 = xmm4[2,3,0,1]
>   movdqa  %xmm6, %xmm7
>   psrad   $31, %xmm7
>   punpckldq       %xmm7, %xmm6    # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
>   movdqa  %xmm4, %xmm7
>   psrad   $31, %xmm7
>   punpckldq       %xmm7, %xmm4    # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
>   paddq   %xmm4, %xmm0
>   paddq   %xmm6, %xmm3
>   addl    $-4, %eax
>   movdqa  %xmm5, %xmm4                      <----- here
>   jne     .LBB0_1
>

I think the first movdqa can at least be promoted to the loop preheader.
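
For context, the loop appears to come from source of roughly this
shape -- a reconstruction from the assembly (an i32 IV starting at
[0,1,2,3] with step 4, an xor with 2, sign extension to i64, and a
64-bit sum over 1000 iterations), not the patch's actual testcase:

  long long foo(void) {
    long long sum = 0;
    for (int i = 0; i < 1000; i++)
      sum += (long long)(i ^ 2);  /* pxor; psrad/punpckldq sign-extend;
                                     paddq accumulates in two halves */
    return sum;
  }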

The original generated code, including the preheader block, is:

# BB#0:                                 # %entry
        pxor    %xmm0, %xmm0
        movdqa  .LCPI0_0(%rip), %xmm4   # xmm4 = [0,1,2,3]
        movl    $1000, %eax             # imm = 0x3E8
        movdqa  .LCPI0_1(%rip), %xmm1   # xmm1 = [4,4,4,4]
        movdqa  .LCPI0_2(%rip), %xmm2   # xmm2 = [2,2,2,2]
        pxor    %xmm3, %xmm3
        .p2align        4, 0x90
.LBB0_1:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        movdqa  %xmm4, %xmm5
        paddd   %xmm1, %xmm5
        pxor    %xmm2, %xmm4
        pshufd  $78, %xmm4, %xmm6       # xmm6 = xmm4[2,3,0,1]
        movdqa  %xmm6, %xmm7
        psrad   $31, %xmm7
        punpckldq       %xmm7, %xmm6    # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
        movdqa  %xmm4, %xmm7
        psrad   $31, %xmm7
        punpckldq       %xmm7, %xmm4    # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
        paddq   %xmm4, %xmm0
        paddq   %xmm6, %xmm3
        addl    $-4, %eax
        movdqa  %xmm5, %xmm4
        jne     .LBB0_1

It is safe to move the "movdqa  %xmm4, %xmm5" at the start of LBB0_1 to
the end of each of its predecessors: the end of BB#0 and the end of
LBB0_1 itself.

# BB#0:                                 # %entry
        pxor    %xmm0, %xmm0
        movdqa  .LCPI0_0(%rip), %xmm4   # xmm4 = [0,1,2,3]
        movl    $1000, %eax             # imm = 0x3E8
        movdqa  .LCPI0_1(%rip), %xmm1   # xmm1 = [4,4,4,4]
        movdqa  .LCPI0_2(%rip), %xmm2   # xmm2 = [2,2,2,2]
        pxor    %xmm3, %xmm3
        movdqa  %xmm4, %xmm5  ==> promoted to preheader
        .p2align        4, 0x90
.LBB0_1:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        paddd   %xmm1, %xmm5
        pxor    %xmm2, %xmm4
        pshufd  $78, %xmm4, %xmm6       # xmm6 = xmm4[2,3,0,1]
        movdqa  %xmm6, %xmm7
        psrad   $31, %xmm7
        punpckldq       %xmm7, %xmm6    # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
        movdqa  %xmm4, %xmm7
        psrad   $31, %xmm7
        punpckldq       %xmm7, %xmm4    # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
        paddq   %xmm4, %xmm0
        paddq   %xmm6, %xmm3
        addl    $-4, %eax
        movdqa  %xmm5, %xmm4
        movdqa  %xmm4, %xmm5     ==> apparently redundant and will be deleted.
        jne     .LBB0_1


I think this is actually a weakness in register coalescing. I already
have a similar testcase which probably has the same cause, so it is
good to have another one now. It shows that the problem may be more
general than I thought and justifies more work to improve it. I will
file a separate bug for it.
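
The copies exist because the next IV value (%xmm5) is defined at the
top of the loop while the current value (%xmm4) is still used later in
the body, so the two live ranges overlap and cannot be coalesced into
a single register. A hypothetical scalar analog of that live-range
structure (illustration only, not the testcase mentioned above):

  int f(int n) {
    int sum = 0;
    int cur = 0;
    while (n-- > 0) {
      int next = cur + 4;  /* defined early: overlaps cur's live range */
      sum += cur ^ 2;      /* cur is still used after next is defined */
      cur = next;          /* the copy across the backedge */
    }
    return sum;
  }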

Thanks,
Wei.

