[LLVMbugs] [Bug 6396] New: de-optimization of down-counting loops with i32 counter

Mon Feb 22 14:03:17 PST 2010

http://llvm.org/bugs/show_bug.cgi?id=6396

           Summary: de-optimization of down-counting loops with i32
                    counter
           Product: libraries
           Version: 2.6
          Platform: PC
        OS/Version: Linux
            Status: NEW
          Severity: normal
          Priority: P5
         Component: Loop Optimizer
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: llvm at henning-thielemann.de
                CC: llvmbugs at cs.uiuc.edu

I have written a simple loop using an i32 counter that counts from the number
of repetitions down to zero. I hoped that x86 codegen could make use of the
LOOP instruction. It does not, but this is not the main problem here.

define i32 @_fun1(i32, float*) {
_L1:
  br label %_L2

_L2:
  %2 = phi i32 [ %0, %_L1 ], [ %7, %_L3 ]
  %3 = phi float* [ %1, %_L1 ], [ %8, %_L3 ]
  %4 = phi float [ 1.000000e+00, %_L1 ], [ %6, %_L3 ]
  %5 = icmp ne i32 %2, 0
  br i1 %5, label %_L3, label %_L5

_L3:
  %6 = fmul float %4, 0x3FEFFFE2E0000000
  store float %4, float* %3
  %7 = sub i32 %2, 1
  %8 = getelementptr float* %3, i32 1
  br label %_L2

_L5:
  %9 = sub i32 %0, %2
  ret i32 %9
}

This gets compiled to:

.LBB1_0:                                                    # %_L1
    movl    4(%esp), %ecx
    incl    %ecx
    movss    .LCPI1_0, %xmm0
    xorl    %eax, %eax
    movl    8(%esp), %edx
    jmp    .LBB1_2
    .align    16
.LBB1_1:                                                    # %_L3
                                                            # Loop Depth 1
                                                            # Loop Header is BB1_2
                                                            # Inner Loop
    movss    %xmm0, (%edx,%eax,4)
    incl    %eax
    mulss    .LCPI1_1, %xmm0
.LBB1_2:                                                    # %_L2
                                                            # Loop Depth 1
                                                            # Loop Header
                                                            # Inner Loop
    decl    %ecx
    jne    .LBB1_1

I still wonder why it does not simply increment %edx, as I told it to in the
LLVM code. But the main problem follows:

Optimization introduces a new counter that is 64 bits wide and counts upwards:

define i32 @_fun1(i32, float* nocapture) nounwind {
_L1:
  %2 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
  br i1 %2, label %_L5, label %bb.nph

bb.nph:                                           ; preds = %_L1
  %tmp = zext i32 %0 to i64                       ; <i64> [#uses=1]
  br label %_L3

_L3:                                              ; preds = %_L3, %bb.nph
  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %_L3 ] ; <i64> [#uses=2]
  %3 = phi float [ 1.000000e+00, %bb.nph ], [ %4, %_L3 ] ; <float> [#uses=2]
  %scevgep = getelementptr float* %1, i64 %indvar ; <float*> [#uses=1]
  %4 = fmul float %3, 0x3FEFFFE2E0000000          ; <float> [#uses=1]
  store float %3, float* %scevgep
  %indvar.next = add i64 %indvar, 1               ; <i64> [#uses=2]
  %exitcond = icmp eq i64 %indvar.next, %tmp      ; <i1> [#uses=1]
  br i1 %exitcond, label %_L5, label %_L3

_L5:                                              ; preds = %_L3, %_L1
  %.lcssa = phi i32 [ %0, %_L1 ], [ 0, %_L3 ]     ; <i32> [#uses=1]
  %5 = sub i32 %0, %.lcssa                        ; <i32> [#uses=1]
  ret i32 %5
}

The assembly code no longer looks as nice as before:

.LBB1_0:                                                    # %_L1
    pushl    %edi
    pushl    %esi
    movl    12(%esp), %eax
    testl    %eax, %eax
    je    .LBB1_5
.LBB1_1:                                                    # %bb.nph
    movl    16(%esp), %ecx
    xorl    %edx, %edx
    movss    .LCPI1_0, %xmm0
    movl    %eax, %esi
    .align    16
.LBB1_2:                                                    # %_L3
                                                            # Loop Depth 1
                                                            # Loop Header
                                                            # Inner Loop
    movss    %xmm0, (%ecx)
    addl    $4294967295, %esi
    adcl    $4294967295, %edx
    movl    %esi, %edi
    orl    %edx, %edi
    addl    $4, %ecx
    testl    %edi, %edi
    mulss    .LCPI1_1, %xmm0
    jne    .LBB1_2
.LBB1_3:                                                    # %_L3._L5_crit_edge
    xorl    %ecx, %ecx
.LBB1_4:                                                    # %_L5
    subl    %ecx, %eax
    popl    %esi
    popl    %edi
    ret
.LBB1_5:                                                    # %_L1._L5_crit_edge
    movl    %eax, %ecx
    jmp    .LBB1_4

The 64-bit computation adds extra instructions, yet it is unnecessary, since
the original LLVM code is satisfied with a 32-bit counter.

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.