            Bug ID: 33234
           Summary: LLVM misses loadpre opportunities
           Product: clang
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: LLVM Codegen
          Assignee: unassignedclangbugs at nondot.org
          Reporter: danielcdh at gmail.com
                CC: llvm-bugs at lists.llvm.org

#cat t.cc
void foo(int *p, int t, int *a, int *b, int *c) {
  for (int i = 0; i < t; i++) {
    if (a[i]) {
      *p += b[i];
    } else {
      *p *= c[i];
#clang -O2 -S t.cc -fno-unroll-loops
        testl   %esi, %esi
        jle     .LBB0_6
# BB#1:
        movl    %esi, %eax
        .p2align        4, 0x90
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        cmpl    $0, (%rdx)
        je      .LBB0_4
# BB#3:                                 #   in Loop: Header=BB0_2 Depth=1
        movl    (%rdi), %esi
        addl    (%rcx), %esi
        jmp     .LBB0_5
        .p2align        4, 0x90
.LBB0_4:                                #   in Loop: Header=BB0_2 Depth=1
        movl    (%rdi), %esi
        imull   (%r8), %esi
.LBB0_5:                                #   in Loop: Header=BB0_2 Depth=1
        movl    %esi, (%rdi)
        addq    $4, %rdx
        addq    $4, %rcx
        addq    $4, %r8
        decq    %rax
        jne     .LBB0_2

#gcc -O2 -S t.cc -fno-unroll-loops
        testl   %esi, %esi
        jle     .L1
        movl    (%rdi), %r9d
        xorl    %eax, %eax
        jmp     .L5
        .p2align 4,,10
        .p2align 3
        addl    (%rcx,%rax,4), %r9d
        addq    $1, %rax
        movl    %r9d, (%rdi)
        cmpl    %eax, %esi
        jle     .L1
        movl    (%rdx,%rax,4), %r10d
        testl   %r10d, %r10d
        jne     .L9
        imull   (%r8,%rax,4), %r9d
        addq    $1, %rax
        movl    %r9d, (%rdi)
        cmpl    %eax, %esi
        jg      .L5

In the gcc version, each loop iteration performs only 2 loads and 1 store. In
the LLVM version, each iteration performs 3 loads and 1 store; the extra load
is the reload of *p (`movl (%rdi), %esi`), which appears on both branches.

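The problem is that LLVM fails to hoist the load of *p out of the loop. Every
iteration ends by storing the updated value back to *p, so that value is
already available in a register on the loop back edge; a single load before
the loop (as gcc does with `movl (%rdi), %r9d`) is sufficient.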

