<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - gep(gep...) merging in instcombine optimization hurts performance sometimes"

   href="https://llvm.org/bugs/show_bug.cgi?id=23163">23163</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>gep(gep...) merging in instcombine optimization hurts performance sometimes

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>wmi@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Created <span class=""><a href="attachment.cgi?id=14170" name="attach_14170" title="1.cc">attachment 14170</a> <a href="attachment.cgi?id=14170&action=edit" title="1.cc">[details]</a></span>

1.cc

For the testcase 1.cc attached, 

~/workarea/llvm-r234388/build/bin/clang -O2 -fno-omit-frame-pointer -std=c++11

1.cc -S -o 1.s

In 1.s, the kernel loop contains 29 insns:

.LBB1_6:                                # %for.body7

                                        #   Parent Loop BB1_3 Depth=1

                                        # =>  This Inner Loop Header: Depth=2

        leaq    (%r12,%rdi), %rsi

        movslq  %esi, %rbx

        leaq    (%rbx,%r10), %rsi

        movzbl  (%r9,%rsi), %esi

        leal    (%rsi,%rsi,2), %esi

        leal    (%r13,%rdi), %r8d

        movslq  %r8d, %rcx

        addq    %r10, %rcx

        movzbl  (%r9,%rcx), %ecx

        leal    1(%rbx), %eax

        cltq

        addq    %r10, %rax

        movzbl  (%r9,%rax), %eax

        addl    %ecx, %eax

        leal    (%r14,%rdi), %ecx

        movslq  %ecx, %rcx

        addq    %r10, %rcx

        movzbl  (%r9,%rcx), %ecx

        addl    $2, %ebx

        movslq  %ebx, %rbx

        addq    %r10, %rbx

        movzbl  (%r9,%rbx), %ebx

        leal    (%rcx,%rsi,2), %ecx

        leal    (%rcx,%rax,4), %eax

        addl    %ebx, %eax

        movl    %eax, (%r11,%rdi,2)

        addq    $2, %rdi

        decl    %edx

        jne     .LBB1_6

If we disable gep merge in InstCombine pass, the kernel loop contains less

insns (25 insns):

.LBB1_6:                                # %for.body7

                                        #   Parent Loop BB1_3 Depth=1

                                        # =>  This Inner Loop Header: Depth=2

        leaq    (%r12,%rcx), %rsi

        movslq  %esi, %rsi

        movzbl  (%r9,%rsi), %edx

        leal    (%rdx,%rdx,2), %edx

        leal    (%r13,%rcx), %ebx

        movslq  %ebx, %rbx

        movzbl  (%r9,%rbx), %ebx

        leal    1(%rsi), %r8d

        movslq  %r8d, %rax

        movzbl  (%r9,%rax), %eax

        addl    %ebx, %eax

        leal    (%r14,%rcx), %ebx

        movslq  %ebx, %rbx

        movzbl  (%r9,%rbx), %ebx

        addl    $2, %esi

        movslq  %esi, %rsi

        movzbl  (%r9,%rsi), %esi

        leal    (%rbx,%rdx,2), %edx

        leal    (%rdx,%rax,4), %eax

        addl    %esi, %eax

        movl    %eax, (%r11,%rcx,2)

        addq    $2, %rcx

        incl    %edi

        cmpl    %edi, %r15d

        jne     .LBB1_6

gep(gep ...) merging optimization is similar with forward propagation

optimization. We need to be careful especially when the source gep has more

than one use. Usually we do the optimization only when gep merging will not

increase the cost of the destination. We can see why gep merging is bad here

from the IR of 1.cc below:

* The IR before InstCombine:

  ...

  %2 = load i8*, i8** %data, align 8, !tbaa !7

  %idx.ext2 = sext i32 %call1 to i64

  %add.ptr3 = getelementptr inbounds i8, i8* %2, i64 %idx.ext2

  ...

for.body7:

    ...

  %arrayidx = getelementptr inbounds i8, i8* %add.ptr3, i64 %idxprom

  %3 = load i8, i8* %arrayidx, align 1, !tbaa !9

    ...

  %arrayidx11 = getelementptr inbounds i8, i8* %add.ptr3, i64 %idxprom10

  %4 = load i8, i8* %arrayidx11, align 1, !tbaa !9

    ...

  %arrayidx15 = getelementptr inbounds i8, i8* %add.ptr3, i64 %idxprom14

  %5 = load i8, i8* %arrayidx15, align 1, !tbaa !9

    ...

  %arrayidx23 = getelementptr inbounds i8, i8* %add.ptr3, i64 %idxprom22

  %6 = load i8, i8* %arrayidx23, align 1, !tbaa !9

    ...

  br label %for.cond5

* The IR after InstCombine with gep merge:

  ...

  %2 = load i8*, i8** %data, align 8, !tbaa !7

  %idx.ext2 = sext i32 %call1 to i64

  ...

for.body7: 

    ...

  %add.ptr3.sum = add nsw i64 %idx.ext2, %idxprom

  %arrayidx = getelementptr inbounds i8, i8* %2, i64 %add.ptr3.sum

  %3 = load i8, i8* %arrayidx, align 1, !tbaa !9

    ...

  %add.ptr3.sum61 = add nsw i64 %idx.ext2, %idxprom10

  %arrayidx11 = getelementptr inbounds i8, i8* %2, i64 %add.ptr3.sum61

  %4 = load i8, i8* %arrayidx11, align 1, !tbaa !9

    ...

  %add.ptr3.sum63 = add nsw i64 %idx.ext2, %idxprom14

  %arrayidx15 = getelementptr inbounds i8, i8* %2, i64 %add.ptr3.sum63

  %5 = load i8, i8* %arrayidx15, align 1, !tbaa !9

    ...

  %add.ptr3.sum64 = add nsw i64 %idx.ext2, %idxprom22

  %arrayidx23 = getelementptr inbounds i8, i8* %2, i64 %add.ptr3.sum64

  %6 = load i8, i8* %arrayidx23, align 1, !tbaa !9

    ...

  br label %for.cond5

We can see that after gep merging, there is one less gep outside the loop, but

there are four more add insns inside the loop.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>