[LLVMbugs] [Bug 23580] New: Disabling gep merging pessimizes the code after loop vectorization

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Tue May 19 11:58:34 PDT 2015


https://llvm.org/bugs/show_bug.cgi?id=23580

            Bug ID: 23580
           Summary: Disabling gep merging pessimizes the code after loop
                    vectorization
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: wmi at google.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 14343
  --> https://llvm.org/bugs/attachment.cgi?id=14343&action=edit
testcase 1.cc

Gep merging was mostly disabled in r235455, and that was good for most
cases. However, I found a case where disabling gep merging pessimizes the
code after loop vectorization. It may be better to relax the constraint on gep
merging a little and allow a src gep with a single use to be merged with the
dest gep in the same bb.

For the kernel loop (marked in testcase 1.cc), without gep merging, loop
vectorization cannot recognize that the pointers of the loads in consecutive
loop iterations are consecutive. That is because
LoopVectorizationLegality::isConsecutivePtr only checks one level of gep to
find the induction information. This pessimizes the vectorized code a lot in
this case.

Without gep merging (the loads that should be recognized as consecutive are
marked):

for.body:                                         ; preds = %scalar.ph,
%for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [
%bc.trunc.resume.val, %scalar.ph ]
  %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64
%indvars.iv
  %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i64 0, i32
0
  %72 = load i16, i16* %ival, align 2      ===> consecutive load
  %conv17 = sext i16 %72 to i32
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx19 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64
%indvars.iv.next
  %ival20 = getelementptr inbounds %struct.B, %struct.B* %arrayidx19, i64 0,
i32 0
  %73 = load i16, i16* %ival20, align 2    ===> consecutive load
  %conv21 = sext i16 %73 to i32
  %add22 = add nsw i32 %conv21, %conv17
  %mul = mul nsw i32 %add22, %11
  %add23 = add nsw i32 %mul, %conv7
  %shr = ashr i32 %add23, %conv
  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64
%indvars.iv, i32 0
  %74 = load i16, i16* %ival26, align 2
  %conv2783 = zext i16 %74 to i32
  %sub = sub i32 %conv2783, %shr
  %conv28 = trunc i32 %sub to i16
  store i16 %conv28, i16* %ival26, align 2
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %2
  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !7

After gep merging:
for.body:                                         ; preds =
%for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1,
%for.body.preheader ]
  %ival = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64
%indvars.iv, i32 0
  %12 = load i16, i16* %ival, align 2
  %conv17 = sext i16 %12 to i32
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %ival20 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64
%indvars.iv.next, i32 0
  %13 = load i16, i16* %ival20, align 2
  %conv21 = sext i16 %13 to i32
  %add22 = add nsw i32 %conv21, %conv17
  %mul = mul nsw i32 %add22, %11
  %add23 = add nsw i32 %mul, %conv7
  %shr = ashr i32 %add23, %conv
  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64
%indvars.iv, i32 0
  %14 = load i16, i16* %ival26, align 2
  %conv2783 = zext i16 %14 to i32
  %sub = sub i32 %conv2783, %shr
  %conv28 = trunc i32 %sub to i16
  store i16 %conv28, i16* %ival26, align 2
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %2
  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !2

vectorization code without gep merging:
.LBB0_19:                               # %vector.body
        xorps   %xmm3, %xmm3
        movss   %xmm10, %xmm3           # xmm3 = xmm10[0],xmm3[1,2,3]
        leaq    1(%r12), %rdi
        movd    %rdi, %xmm4
        pshufd  $68, %xmm4, %xmm5       # xmm5 = xmm4[0,1,0,1]
        movl    $1, %edx
        movd    %rdx, %xmm4
        pslldq  $8, %xmm4               # xmm4 =
zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
        paddq   %xmm5, %xmm4
        movdqa  %xmm5, %xmm6
        paddq   %xmm7, %xmm6
        pshufd  $78, %xmm4, %xmm4       # xmm4 = xmm4[2,3,0,1]
        movd    %xmm4, %rdi
        movd    %xmm6, %rdx
        pshufd  $78, %xmm6, %xmm4       # xmm4 = xmm6[2,3,0,1]
        movd    %xmm4, %rsi
        pinsrw  $0, (%r10,%r12,2), %xmm4
        pinsrw  $2, (%r8,%rdi,2), %xmm4
        pinsrw  $4, (%r8,%rdx,2), %xmm4
        pinsrw  $6, (%r8,%rsi,2), %xmm4
        pslld   $16, %xmm4
        psrad   $16, %xmm4
        movdqa  %xmm5, %xmm6
        paddq   %xmm1, %xmm6
        paddq   %xmm2, %xmm5
        movd    %xmm5, %rdx
        pshufd  $78, %xmm5, %xmm5       # xmm5 = xmm5[2,3,0,1]
        movd    %xmm5, %rsi
        movd    %xmm6, %rdi
        pshufd  $78, %xmm6, %xmm5       # xmm5 = xmm6[2,3,0,1]
        movq    %r10, %r11
        movd    %xmm5, %r10
        pinsrw  $0, (%r8,%rdx,2), %xmm5
        pinsrw  $2, (%r8,%rsi,2), %xmm5
        pinsrw  $4, (%r8,%rdi,2), %xmm5
        pinsrw  $6, (%r8,%r10,2), %xmm5
        movq    %r11, %r10
        pslld   $16, %xmm5
        psrad   $16, %xmm5
        paddd   %xmm4, %xmm5
        pshufd  $245, %xmm5, %xmm4      # xmm4 = xmm5[1,1,3,3]
        pmuludq %xmm0, %xmm5
        pshufd  $232, %xmm5, %xmm5      # xmm5 = xmm5[0,2,2,3]
        pshufd  $245, %xmm0, %xmm6      # xmm6 = xmm0[1,1,3,3]
        pmuludq %xmm4, %xmm6
        pshufd  $232, %xmm6, %xmm4      # xmm4 = xmm6[0,2,2,3]
        punpckldq       %xmm4, %xmm5    # xmm5 =
xmm5[0],xmm4[0],xmm5[1],xmm4[1]
        paddd   %xmm9, %xmm5
        psrad   %xmm3, %xmm5
        movq    2(%rax,%r12,2), %xmm3   # xmm3 = mem[0],zero
        punpcklwd       %xmm8, %xmm3    # xmm3 =
xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
        psubw   %xmm5, %xmm3
        pshuflw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3,4,5,6,7]
        pshufhw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3,4,6,6,7]
        pshufd  $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3]
        movq    %xmm3, 2(%rax,%r12,2)
        addq    $4, %r12
        cmpq    %r12, %r9
        jne     .LBB0_19

vectorization code with gep merging:
.LBB0_19:                               # %vector.body
        xorps   %xmm3, %xmm3
        movss   %xmm2, %xmm3            # xmm3 = xmm2[0],xmm3[1,2,3]
        movq    -2(%rdx), %xmm4         # xmm4 = mem[0],zero
        punpcklwd       %xmm4, %xmm4    # xmm4 = xmm4[0,0,1,1,2,2,3,3]
        psrad   $16, %xmm4
        movq    (%rdx), %xmm5           # xmm5 = mem[0],zero
        punpcklwd       %xmm5, %xmm5    # xmm5 = xmm5[0,0,1,1,2,2,3,3]
        psrad   $16, %xmm5
        paddd   %xmm4, %xmm5
        pshufd  $245, %xmm5, %xmm4      # xmm4 = xmm5[1,1,3,3]
        pmuludq %xmm0, %xmm5
        pshufd  $232, %xmm5, %xmm5      # xmm5 = xmm5[0,2,2,3]
        pshufd  $245, %xmm0, %xmm6      # xmm6 = xmm0[1,1,3,3]
        pmuludq %xmm4, %xmm6
        pshufd  $232, %xmm6, %xmm4      # xmm4 = xmm6[0,2,2,3]
        punpckldq       %xmm4, %xmm5    # xmm5 =
xmm5[0],xmm4[0],xmm5[1],xmm4[1]
        paddd   %xmm1, %xmm5
        psrad   %xmm3, %xmm5
        movq    (%rsi), %xmm3           # xmm3 = mem[0],zero
        punpcklwd       %xmm7, %xmm3    # xmm3 =
xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
        psubw   %xmm5, %xmm3
        pshuflw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3,4,5,6,7]
        pshufhw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3,4,6,6,7]
        pshufd  $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3]
        movq    %xmm3, (%rsi)
        addq    $8, %rdx
        addq    $8, %rsi
        addq    $-4, %rdi
        jne     .LBB0_19

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20150519/6c80a397/attachment.html>


More information about the llvm-bugs mailing list