[LLVMbugs] [Bug 23580] New: Disabling gep merging pessimizes the code after loop vectorization
bugzilla-daemon at llvm.org
Tue May 19 11:58:34 PDT 2015
https://llvm.org/bugs/show_bug.cgi?id=23580
Bug ID: 23580
Summary: Disabling gep merging pessimizes the code after loop
vectorization
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Scalar Optimizations
Assignee: unassignedbugs at nondot.org
Reporter: wmi at google.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
Created attachment 14343
--> https://llvm.org/bugs/attachment.cgi?id=14343&action=edit
testcase 1.cc
Gep merging was mostly disabled in r235455, and that was good for most cases.
However, I found a case where disabling gep merging pessimizes the code after
loop vectorization. It may be better to relax the constraint on gep merging a
little and allow a src gep with a single use to be merged into the dest gep in
the same basic block, as sketched below.
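A minimal sketch of the pattern in question, reusing the names from the IR
dumps below: the two-level pair

  %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv
  %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i64 0, i32 0

where %arrayidx16 has a single use and both geps sit in the same basic block,
would be folded back into the single gep

  %ival = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv, i32 0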
For the kernel loop (marked in testcase 1.cc), without gep merging, loop
vectorization cannot recognize that the pointers of the loads in consecutive
loop iterations are consecutive. That is because
LoopVectorizationLegality::isConsecutivePtr only checks one level of gep to
find the induction information. This pessimizes the vectorized code a lot in
this case.
Without gep merging:
for.body:                                         ; preds = %scalar.ph, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %bc.trunc.resume.val, %scalar.ph ]
  %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv
  %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i64 0, i32 0
  %72 = load i16, i16* %ival, align 2             ===> consecutive load
  %conv17 = sext i16 %72 to i32
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx19 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv.next
  %ival20 = getelementptr inbounds %struct.B, %struct.B* %arrayidx19, i64 0, i32 0
  %73 = load i16, i16* %ival20, align 2           ===> consecutive load
  %conv21 = sext i16 %73 to i32
  %add22 = add nsw i32 %conv21, %conv17
  %mul = mul nsw i32 %add22, %11
  %add23 = add nsw i32 %mul, %conv7
  %shr = ashr i32 %add23, %conv
  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64 %indvars.iv, i32 0
  %74 = load i16, i16* %ival26, align 2
  %conv2783 = zext i16 %74 to i32
  %sub = sub i32 %conv2783, %shr
  %conv28 = trunc i32 %sub to i16
  store i16 %conv28, i16* %ival26, align 2
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %2
  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !7
After gep merging:
for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
  %ival = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv, i32 0
  %12 = load i16, i16* %ival, align 2
  %conv17 = sext i16 %12 to i32
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %ival20 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %indvars.iv.next, i32 0
  %13 = load i16, i16* %ival20, align 2
  %conv21 = sext i16 %13 to i32
  %add22 = add nsw i32 %conv21, %conv17
  %mul = mul nsw i32 %add22, %11
  %add23 = add nsw i32 %mul, %conv7
  %shr = ashr i32 %add23, %conv
  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64 %indvars.iv, i32 0
  %14 = load i16, i16* %ival26, align 2
  %conv2783 = zext i16 %14 to i32
  %sub = sub i32 %conv2783, %shr
  %conv28 = trunc i32 %sub to i16
  store i16 %conv28, i16* %ival26, align 2
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %2
  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !2
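With the merged single-level gep, isConsecutivePtr can find the induction
variable directly, so the vectorizer can prove the loads consecutive and emit
a wide load instead of scalarizing them. A hypothetical sketch of the load the
vector.body could then use (not taken from the actual output; %index stands
for the vector loop induction variable):

  %ptr = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %index, i32 0
  %vptr = bitcast i16* %ptr to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %vptr, align 2   ; four consecutive i16s in one load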
Vectorized code without gep merging (the loads are scalarized into pinsrw
element inserts):
.LBB0_19: # %vector.body
xorps %xmm3, %xmm3
movss %xmm10, %xmm3 # xmm3 = xmm10[0],xmm3[1,2,3]
leaq 1(%r12), %rdi
movd %rdi, %xmm4
pshufd $68, %xmm4, %xmm5 # xmm5 = xmm4[0,1,0,1]
movl $1, %edx
movd %rdx, %xmm4
pslldq $8, %xmm4 # xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
paddq %xmm5, %xmm4
movdqa %xmm5, %xmm6
paddq %xmm7, %xmm6
pshufd $78, %xmm4, %xmm4 # xmm4 = xmm4[2,3,0,1]
movd %xmm4, %rdi
movd %xmm6, %rdx
pshufd $78, %xmm6, %xmm4 # xmm4 = xmm6[2,3,0,1]
movd %xmm4, %rsi
pinsrw $0, (%r10,%r12,2), %xmm4
pinsrw $2, (%r8,%rdi,2), %xmm4
pinsrw $4, (%r8,%rdx,2), %xmm4
pinsrw $6, (%r8,%rsi,2), %xmm4
pslld $16, %xmm4
psrad $16, %xmm4
movdqa %xmm5, %xmm6
paddq %xmm1, %xmm6
paddq %xmm2, %xmm5
movd %xmm5, %rdx
pshufd $78, %xmm5, %xmm5 # xmm5 = xmm5[2,3,0,1]
movd %xmm5, %rsi
movd %xmm6, %rdi
pshufd $78, %xmm6, %xmm5 # xmm5 = xmm6[2,3,0,1]
movq %r10, %r11
movd %xmm5, %r10
pinsrw $0, (%r8,%rdx,2), %xmm5
pinsrw $2, (%r8,%rsi,2), %xmm5
pinsrw $4, (%r8,%rdi,2), %xmm5
pinsrw $6, (%r8,%r10,2), %xmm5
movq %r11, %r10
pslld $16, %xmm5
psrad $16, %xmm5
paddd %xmm4, %xmm5
pshufd $245, %xmm5, %xmm4 # xmm4 = xmm5[1,1,3,3]
pmuludq %xmm0, %xmm5
pshufd $232, %xmm5, %xmm5 # xmm5 = xmm5[0,2,2,3]
pshufd $245, %xmm0, %xmm6 # xmm6 = xmm0[1,1,3,3]
pmuludq %xmm4, %xmm6
pshufd $232, %xmm6, %xmm4 # xmm4 = xmm6[0,2,2,3]
punpckldq %xmm4, %xmm5 # xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
paddd %xmm9, %xmm5
psrad %xmm3, %xmm5
movq 2(%rax,%r12,2), %xmm3 # xmm3 = mem[0],zero
punpcklwd %xmm8, %xmm3 # xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
psubw %xmm5, %xmm3
pshuflw $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm3, %xmm3 # xmm3 = xmm3[0,1,2,3,4,6,6,7]
pshufd $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3]
movq %xmm3, 2(%rax,%r12,2)
addq $4, %r12
cmpq %r12, %r9
jne .LBB0_19
Vectorized code with gep merging (the loads become plain contiguous movq
loads):
.LBB0_19: # %vector.body
xorps %xmm3, %xmm3
movss %xmm2, %xmm3 # xmm3 = xmm2[0],xmm3[1,2,3]
movq -2(%rdx), %xmm4 # xmm4 = mem[0],zero
punpcklwd %xmm4, %xmm4 # xmm4 = xmm4[0,0,1,1,2,2,3,3]
psrad $16, %xmm4
movq (%rdx), %xmm5 # xmm5 = mem[0],zero
punpcklwd %xmm5, %xmm5 # xmm5 = xmm5[0,0,1,1,2,2,3,3]
psrad $16, %xmm5
paddd %xmm4, %xmm5
pshufd $245, %xmm5, %xmm4 # xmm4 = xmm5[1,1,3,3]
pmuludq %xmm0, %xmm5
pshufd $232, %xmm5, %xmm5 # xmm5 = xmm5[0,2,2,3]
pshufd $245, %xmm0, %xmm6 # xmm6 = xmm0[1,1,3,3]
pmuludq %xmm4, %xmm6
pshufd $232, %xmm6, %xmm4 # xmm4 = xmm6[0,2,2,3]
punpckldq %xmm4, %xmm5 # xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
paddd %xmm1, %xmm5
psrad %xmm3, %xmm5
movq (%rsi), %xmm3 # xmm3 = mem[0],zero
punpcklwd %xmm7, %xmm3 # xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
psubw %xmm5, %xmm3
pshuflw $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm3, %xmm3 # xmm3 = xmm3[0,1,2,3,4,6,6,7]
pshufd $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3]
movq %xmm3, (%rsi)
addq $8, %rdx
addq $8, %rsi
addq $-4, %rdi
jne .LBB0_19