[llvm-commits] [llvm] r123621 - in /llvm/trunk: lib/CodeGen/SelectionDAG/TargetLowering.cpp test/CodeGen/X86/ctpop-combine.ll
Benjamin Kramer
benny.kra at googlemail.com
Tue Jan 18 07:38:31 PST 2011
On Tue, Jan 18, 2011 at 09:12, Chris Lattner <clattner at apple.com> wrote:
>
> On Jan 17, 2011, at 4:04 AM, Benjamin Kramer wrote:
>
>> Author: d0k
>> Date: Mon Jan 17 06:04:57 2011
>> New Revision: 123621
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=123621&view=rev
>> Log:
>> Add a DAGCombine to turn (ctpop x) u< 2 into (x & x-1) == 0.
>>
>> This shaves off 4 popcounts from the hacked 186.crafty source.
>>
>> This is enabled even when a native popcount instruction is available. The
>> combined code is one operation longer but it should be faster nevertheless.
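(In C terms the combine just relies on the fact that x & (x - 1) clears the lowest set bit, so the result is zero exactly when at most one bit is set; a quick sketch:)

  #include <assert.h>
  #include <stdint.h>

  /* popcount(x) < 2  <=>  (x & (x - 1)) == 0 */
  static int at_most_one_bit_set(uint64_t x) {
    return (x & (x - 1)) == 0;   /* clears the lowest set bit */
  }

  int main(void) {
    assert(at_most_one_bit_set(0));           /* popcount == 0 */
    assert(at_most_one_bit_set(1ULL << 40));  /* popcount == 1 */
    assert(!at_most_one_bit_set(5));          /* popcount == 2 */
    return 0;
  }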
>
> Very nice. This doesn't seem to speed up crafty much, though. Looking at a Shark trace, most of the time is spent in Evaluate and EvaluatePawns, and much of the time in EvaluatePawns is spent in this loop:
>
> %indvar460 = phi i64 [ %indvar.next461, %53 ], [ 0, %FirstOne.exit ]
> %45 = phi i64 [ %49, %53 ], [ %20, %FirstOne.exit ]
> %tmp464 = shl i64 %indvar460, 3
> %tmp465 = add i64 %tmp463, %tmp464
> %scevgep466 = getelementptr [64 x i64]* @w_pawn_attacks, i64 0, i64 %tmp465
> %scevgep467 = getelementptr [64 x i64]* @b_pawn_attacks, i64 0, i64 %tmp465
> %tmp473 = add i64 %tmp472, %tmp464
> %storemerge56 = trunc i64 %tmp473 to i32
> %46 = icmp slt i32 %storemerge56, 48
> br i1 %46, label %47, label %62
> ; <label>:47 ; preds = %44
> %tmp470 = add i64 %tmp469, %tmp464
> %scevgep471 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp470
> %scevgep468 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp465
> %48 = load i64* %scevgep471, align 8, !tbaa !2
> %49 = or i64 %48, %45
> %50 = load i64* %scevgep468, align 8, !tbaa !2
> %51 = and i64 %17, %50
> %52 = icmp eq i64 %51, 0
> br i1 %52, label %53, label %62
> ; <label>:53 ; preds = %47
> %54 = load i64* %scevgep467, align 8, !tbaa !2
> %55 = and i64 %15, %54
> %56 = tail call i64 @llvm.ctpop.i64(i64 %55) nounwind
> %cast.i69 = trunc i64 %56 to i32
> %57 = load i64* %scevgep466, align 8, !tbaa !2
> %58 = and i64 %18, %57
> %59 = tail call i64 @llvm.ctpop.i64(i64 %58) nounwind
> %cast.i70 = trunc i64 %59 to i32
> %60 = sub nsw i32 %cast.i70, %cast.i69
> %61 = icmp eq i32 %60, 2
> %indvar.next461 = add i64 %indvar460, 1
> br i1 %61, label %62, label %44
>
> Our generated machine code is this. Much of the time in this function is spent in the (insane) ctpop block, which is replicated 4 times:
>
> .align 4, 0x90
> LBB4_14: ## Parent Loop BB4_5 Depth=1
> ## => This Inner Loop Header: Depth=2
> cmpl $47, %r11d
> jg LBB4_17
> ## BB#15: ## in Loop: Header=BB4_14 Depth=2
> orq (%r14), %rax
> testq (%r15,%rdx), %rcx
> jne LBB4_17
> ## BB#16: ## in Loop: Header=BB4_14 Depth=2
> leaq _b_pawn_attacks(%rip), %r12
> movq (%r15,%r12), %r12
> andq -16(%rsp), %r12 ## 8-byte Folded Reload
> movq %r12, %r13
> shrq %r13
> andq %rsi, %r13
> subq %r13, %r12
> movq %r12, %r13
> andq %rdi, %r13
> shrq $2, %r12
> andq %rdi, %r12
> addq %r13, %r12
> movq %r12, %r13
> shrq $4, %r13
> addq %r12, %r13
> andq %r8, %r13
> imulq %r9, %r13
> shrq $56, %r13
> leaq _w_pawn_attacks(%rip), %r12
> movq (%r15,%r12), %r12
> andq -8(%rsp), %r12 ## 8-byte Folded Reload
> movq %r12, %rbp
> shrq %rbp
> andq %rsi, %rbp
> subq %rbp, %r12
> movq %r12, %rbp
> andq %rdi, %rbp
> shrq $2, %r12
> andq %rdi, %r12
> addq %rbp, %r12
> movq %r12, %rbp
> shrq $4, %rbp
> addq %r12, %rbp
> andq %r8, %rbp
> imulq %r9, %rbp
> shrq $56, %rbp
> subl %r13d, %ebp
> addq $64, %r15
> addl $8, %r11d
> addq $64, %r14
> cmpl $2, %ebp
> jne LBB4_14
>
> We've done some pretty heroic LSR there and hoisted all the ctpop constants. I don't see any obvious improvements offhand. This same loop is duplicated lower in the function: the first copy accounts for 11% of the time, the second for 13%, a third copy later for another 11%, and a fourth for 10%.
>
> This seems like an important loop! One simple thing that I see is that we have:
>
> ...
> imulq %r9, %r13
> shrq $56, %r13
> ...
> imulq %r9, %rbp
> shrq $56, %rbp
> subl %r13d, %ebp
> ...
> cmpl $2, %ebp
> jne LBB4_14
>
> Is there some cheaper way to do ((x*0x101010101010101)>>56)-((y*0x101010101010101)>>56) != 2?
Applying the distributive law yields ((x-y)*0x101010101010101)>>56 != 2 (modulo borrows between
the byte lanes), but adding a "machine distribution" pass just to catch this one case feels like
overkill. Is there an easier way to catch simplifications like this at codegen time?
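For context, the imulq/shrq $56 pair in the block above is the byte-summing tail of the standard SWAR popcount expansion; roughly, in C (a generic sketch, not the exact DAG the backend builds):

  #include <stdint.h>

  /* After the three masking steps each byte of x holds the popcount of
     the corresponding input byte (0..8); multiplying by
     0x0101010101010101 adds all eight bytes into the top byte, which
     the final shift by 56 extracts. */
  static uint64_t popcount64(uint64_t x) {
    x = x - ((x >> 1) & 0x5555555555555555ULL);
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    return (x * 0x0101010101010101ULL) >> 56;
  }

Both popcounts in the loop end in that same multiply/shift, which is what makes distributing the subtraction under the multiply tempting in the first place.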
> In Evaluate, things are more difficult to analyze. One commonly repeated pattern is a "conditional negate", which looks like this in IR:
>
> DrawScore.exit: ; preds = %186, %180, %171
> %188 = phi i32 [ %172, %171 ], [ %storemerge1.i, %180 ], [ %.392, %186 ]
> %189 = sub nsw i32 0, %188
> %190 = select i1 %170, i32 %189, i32 %188
>
> and codegens to:
>
> LBB3_62: ## %DrawScore.exit
> movl %edx, %eax
> negl %eax
> testl %ecx, %ecx
> cmovnel %edx, %eax
>
> I don't know if there is a trickier way to do that with flags.
One possibility would be:

  cmp  $1, %ecx       # CF = 1 iff %ecx == 0
  sbbl %eax, %eax     # %eax = -CF, i.e. an all-ones mask when %ecx == 0
  (notl %eax)         # optional, for the opposite condition
  xorl %edx, %eax     # %eax = %edx ^ mask

Some processors can even ignore the false sbb dependency on %eax, so this should be slightly
faster than the cmov code. The existing conditional-increment X86 DAGCombine can probably
be extended to handle this.
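Spelled out in C, the mask-based conditional negate is the usual xor/sub trick (a sketch; the cmp/sbb pair above materializes the mask):

  #include <stdint.h>

  /* cond != 0: return x;  cond == 0: return -x.
     mask is 0 or -1, and (x ^ mask) - mask negates x exactly when
     mask is -1, without a branch or cmov. */
  static int32_t cond_negate(int32_t x, int32_t cond) {
    int32_t mask = -(int32_t)(cond == 0);   /* what the cmp/sbb computes */
    return (x ^ mask) - mask;
  }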
>
> This occurs later in a somewhat hot place as well:
>
> %448 = icmp eq i32 %wtm, 0
> %449 = sub nsw i32 0, %storemerge310.lcssa
> %450 = select i1 %448, i32 %449, i32 %storemerge310.lcssa
> %451 = load i32* @largest_positional_score, align 4, !tbaa !3
> %452 = sub nsw i32 %450, %451
>
> movl 180(%rsp), %eax ## 4-byte Reload
> movl %eax, %ecx
> negl %ecx
> cmpl $0, 16(%rsp) ## 4-byte Folded Reload
> cmovnel %eax, %ecx
> movl _largest_positional_score(%rip), %eax
> movl %ecx, %edx
> subl %eax, %edx
>
> That seems somehow simplifiable.
>
> Another conditional negate that is hot is this block:
>
> %955 = and i32 %927, 7
> %956 = sub nsw i32 %955, %898
> %ispos351 = icmp sgt i32 %956, -1
> %neg352 = sub i32 0, %956
> %957 = select i1 %ispos351, i32 %956, i32 %neg352
> %958 = ashr i32 %927, 3
> %959 = sub nsw i32 %958, %900
> %ispos353 = icmp sgt i32 %959, -1
> %neg354 = sub i32 0, %959
> %960 = select i1 %ispos353, i32 %959, i32 %neg354
> %961 = icmp sgt i32 %957, %960
> %962 = select i1 %961, i32 %957, i32 %960
> %963 = sub nsw i32 7, %962
> %964 = mul nsw i32 %901, %963
> %965 = sdiv i32 %964, 10
> %966 = add nsw i32 %965, %952
> br label %967
>
> Which codegens to:
>
>
> LBB3_303: ## in Loop: Header=BB3_287 Depth=1
> movl %r15d, %eax
> sarl $3, %eax
> subl %esi, %eax
> movl %eax, %ebp
> sarl $31, %ebp
> addl %ebp, %eax
> xorl %ebp, %eax
> andl $7, %r15d
> subl %edi, %r15d
> movl %r15d, %ebp
> sarl $31, %ebp
> addl %ebp, %r15d
> xorl %ebp, %r15d
> cmpl %eax, %r15d
> cmovlel %eax, %r15d
> movl $7, %eax
> subl %r15d, %eax
> imull %edx, %eax
> movslq %eax, %rax
> imulq $1717986919, %rax, %rax ## imm = 0x66666667
> movq %rax, %r15
> shrq $63, %r15
> shrq $32, %rax
> sarl $2, %eax
> addl %r15d, %eax
> addl %r13d, %eax
>
> It seems that there should be a less painful way to sdiv by 10. *shrug* This pattern occurs in a couple places.
I don't know of any; 10 is such an odd number ;)
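For reference, the imulq $1717986919 / shift sequence above is the standard magic-number expansion of the signed divide by 10; in C it amounts to (a sketch, assuming arithmetic right shifts on signed values):

  #include <stdint.h>

  /* x / 10 for 32-bit signed x without a divide: 0x66666667 is
     ceil(2^34 / 10), so the 64-bit product shifted right by 34 is the
     quotient rounded toward minus infinity, and adding the sign bit of
     the product rounds it toward zero, as sdiv requires. */
  static int32_t sdiv10(int32_t x) {
    int64_t p = (int64_t)x * 0x66666667LL;
    int32_t q = (int32_t)(p >> 34);
    q += (int32_t)((uint64_t)p >> 63);    /* +1 if the product was negative */
    return q;
  }

That is exactly what we already emit, so any win here presumably has to come from reshaping the surrounding (7 - max(|a|, |b|)) * score / 10 computation rather than from the divide itself.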
> This is an interesting code sequence I noticed:
>
> ; <label>:1299 ; preds = %FirstOne.exit160
> %1300 = lshr i32 128, %1291
> %1301 = and i32 %1266, %1300
> %1302 = icmp ne i32 %1301, 0
> br label %1303
>
> This codegens to:
>
> movb %bl, %cl
> movl $128, %r15d
> shrl %cl, %r15d
> movl 68(%rsp), %ecx ## 4-byte Reload
> testl %r15d, %ecx
> setne %cl
>
> Instead of shifting a constant right to test a bit, it seems that we could shift the value right and use a test, giving us something like:
>
> movl 68(%rsp), %r15d ## 4-byte Reload
> shrl %cl, %r15d
> testl $1, %r15d
> jnz ...
>
> I'm not sure if that is safe though.
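As far as I can tell the rewrite is safe; the only catch is that with a right shift the count has to be reversed (7 - c), while shifting the value left can reuse the original count directly. A quick C check of both forms (32-bit, shift counts up to 31):

  #include <assert.h>
  #include <stdint.h>

  /* The original test is ((128 >> c) & v) != 0, i.e. bit (7 - c) of v
     for c <= 7 and always false beyond that.  The left-shift form is
     equivalent for any count 0..31; the right-shift form needs c <= 7. */
  static int test_orig(uint32_t v, uint32_t c) { return ((128u >> c) & v) != 0; }
  static int test_shl(uint32_t v, uint32_t c)  { return ((v << c) & 128u) != 0; }
  static int test_shr(uint32_t v, uint32_t c)  { return ((v >> (7 - c)) & 1u) != 0; }

  int main(void) {
    for (uint32_t c = 0; c < 32; ++c)
      for (uint32_t v = 0; v < 512; ++v) {
        assert(test_orig(v, c) == test_shl(v, c));
        if (c <= 7)
          assert(test_orig(v, c) == test_shr(v, c));
      }
    return 0;
  }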
>
> Overall, the best way to speed up this function seems to be improving the "divide by 10" idiom. I added a minor README entry about the two shifts, but maybe there is something smart we can do with a bigger scope?
>
>
> The Attacked() function contains this interesting sequence in a couple places:
>
> %B.not2 = and i32 %square, 56
> %41 = xor i32 %B.not2, 56
> %42 = zext i32 %41 to i64
> %43 = lshr i64 %40, %42
>
> Which looks like a "conditional shift" by 56. On targets with predication (like ARM) that would probably codegen better as a select.
I guess the ARM backend should handle this, if anyone cares about it.
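For what it's worth, the pattern in C is just (the function name is mine, for illustration):

  #include <stdint.h>

  /* The IR above in C form: (square & 56) is a multiple of 8 in 0..56,
     and xoring it with 56 mirrors it to 56 - (square & 56), which then
     becomes the shift amount. */
  static uint64_t mirrored_shift(uint64_t bb, int square) {
    return bb >> ((square & 56) ^ 56);
  }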
>
> Anyway, this is probably more than anyone ever wanted to know about crafty.
>
> -Chris