[PATCH] D20315: [LV] For some induction variables, use vector phis instead of widening the scalar in the loop body
Wei Mi via llvm-commits
llvm-commits at lists.llvm.org
Fri May 20 10:57:17 PDT 2016
> However the generated code is still not optimal -- there is unnecessary vector IV copy code that can be moved out of the loop or removed. Perhaps a followup patch to address that?
>
> LBB0_1: # =>This Inner Loop Header: Depth=1
>
> movdqa %xmm4, %xmm5 <---- here
> paddd %xmm1, %xmm5
> pxor %xmm2, %xmm4
> pshufd $78, %xmm4, %xmm6 # xmm6 = xmm4[2,3,0,1]
> movdqa %xmm6, %xmm7
> psrad $31, %xmm7
> punpckldq %xmm7, %xmm6 # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
> movdqa %xmm4, %xmm7
> psrad $31, %xmm7
> punpckldq %xmm7, %xmm4 # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
> paddq %xmm4, %xmm0
> paddq %xmm6, %xmm3
> addl $-4, %eax
> movdqa %xmm5, %xmm4 <----- here
> jne .LBB0_1
>
I think the first movdqa can be at least promoted to the loop preheader.
The original generated code with loop header is:
# BB#0: # %entry
pxor %xmm0, %xmm0
movdqa .LCPI0_0(%rip), %xmm4 # xmm4 = [0,1,2,3]
movl $1000, %eax # imm = 0x3E8
movdqa .LCPI0_1(%rip), %xmm1 # xmm1 = [4,4,4,4]
movdqa .LCPI0_2(%rip), %xmm2 # xmm2 = [2,2,2,2]
pxor %xmm3, %xmm3
.p2align 4, 0x90
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqa %xmm4, %xmm5
paddd %xmm1, %xmm5
pxor %xmm2, %xmm4
pshufd $78, %xmm4, %xmm6 # xmm6 = xmm4[2,3,0,1]
movdqa %xmm6, %xmm7
psrad $31, %xmm7
punpckldq %xmm7, %xmm6 # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
movdqa %xmm4, %xmm7
psrad $31, %xmm7
punpckldq %xmm7, %xmm4 # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
paddq %xmm4, %xmm0
paddq %xmm6, %xmm3
addl $-4, %eax
movdqa %xmm5, %xmm4
jne .LBB0_1
It is safe to move "movdqa %xmm4, %xmm5" from the start of LBB0_1 to the
end of all its predecessors: the end of BB#0 and the end of LBB0_1.
# BB#0: # %entry
pxor %xmm0, %xmm0
movdqa .LCPI0_0(%rip), %xmm4 # xmm4 = [0,1,2,3]
movl $1000, %eax # imm = 0x3E8
movdqa .LCPI0_1(%rip), %xmm1 # xmm1 = [4,4,4,4]
movdqa .LCPI0_2(%rip), %xmm2 # xmm2 = [2,2,2,2]
pxor %xmm3, %xmm3
movdqa %xmm4, %xmm5 ==> promoted to preheader
.p2align 4, 0x90
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
paddd %xmm1, %xmm5
pxor %xmm2, %xmm4
pshufd $78, %xmm4, %xmm6 # xmm6 = xmm4[2,3,0,1]
movdqa %xmm6, %xmm7
psrad $31, %xmm7
punpckldq %xmm7, %xmm6 # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
movdqa %xmm4, %xmm7
psrad $31, %xmm7
punpckldq %xmm7, %xmm4 # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
paddq %xmm4, %xmm0
paddq %xmm6, %xmm3
addl $-4, %eax
movdqa %xmm5, %xmm4
movdqa %xmm4, %xmm5 ==> apparently redundant and will be deleted.
jne .LBB0_1
I think this is actually a weakness in register coalescing. I already
have a similar testcase which probably has the same cause. It is good
to have another one now. It shows the problem may be more general than
I thought and justifies more work to improve it. I will file a separate
bug for it.
Thanks,
Wei.
More information about the llvm-commits
mailing list