[llvm-commits] [llvm] r123621 - in /llvm/trunk: lib/CodeGen/SelectionDAG/TargetLowering.cpp test/CodeGen/X86/ctpop-combine.ll
Chris Lattner
clattner at apple.com
Tue Jan 18 00:12:39 PST 2011
On Jan 17, 2011, at 4:04 AM, Benjamin Kramer wrote:
> Author: d0k
> Date: Mon Jan 17 06:04:57 2011
> New Revision: 123621
>
> URL: http://llvm.org/viewvc/llvm-project?rev=123621&view=rev
> Log:
> Add a DAGCombine to turn (ctpop x) u< 2 into (x & x-1) == 0.
>
> This shaves off 4 popcounts from the hacked 186.crafty source.
>
> This is enabled even when a native popcount instruction is available. The
> combined code is one operation longer but it should be faster nevertheless.
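For the record, the combine rests on the usual clear-the-lowest-set-bit trick: popcount(x) is less than 2 exactly when clearing the lowest set bit of x leaves zero. In C terms (names are my own):

static inline int at_most_one_bit_set(unsigned long long x) {
  /* x & (x - 1) clears the lowest set bit, so the result is zero
     iff x had at most one bit set, i.e. popcount(x) u< 2. */
  return (x & (x - 1)) == 0;
}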
Very nice. This doesn't seem to speed up crafty much, though. Looking at a Shark trace, most of the time is spent in Evaluate and EvaluatePawns, and much of EvaluatePawns' time goes to this loop:
%indvar460 = phi i64 [ %indvar.next461, %53 ], [ 0, %FirstOne.exit ]
%45 = phi i64 [ %49, %53 ], [ %20, %FirstOne.exit ]
%tmp464 = shl i64 %indvar460, 3
%tmp465 = add i64 %tmp463, %tmp464
%scevgep466 = getelementptr [64 x i64]* @w_pawn_attacks, i64 0, i64 %tmp465
%scevgep467 = getelementptr [64 x i64]* @b_pawn_attacks, i64 0, i64 %tmp465
%tmp473 = add i64 %tmp472, %tmp464
%storemerge56 = trunc i64 %tmp473 to i32
%46 = icmp slt i32 %storemerge56, 48
br i1 %46, label %47, label %62
; <label>:47 ; preds = %44
%tmp470 = add i64 %tmp469, %tmp464
%scevgep471 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp470
%scevgep468 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp465
%48 = load i64* %scevgep471, align 8, !tbaa !2
%49 = or i64 %48, %45
%50 = load i64* %scevgep468, align 8, !tbaa !2
%51 = and i64 %17, %50
%52 = icmp eq i64 %51, 0
br i1 %52, label %53, label %62
; <label>:53 ; preds = %47
%54 = load i64* %scevgep467, align 8, !tbaa !2
%55 = and i64 %15, %54
%56 = tail call i64 @llvm.ctpop.i64(i64 %55) nounwind
%cast.i69 = trunc i64 %56 to i32
%57 = load i64* %scevgep466, align 8, !tbaa !2
%58 = and i64 %18, %57
%59 = tail call i64 @llvm.ctpop.i64(i64 %58) nounwind
%cast.i70 = trunc i64 %59 to i32
%60 = sub nsw i32 %cast.i70, %cast.i69
%61 = icmp eq i32 %60, 2
%indvar.next461 = add i64 %indvar460, 1
br i1 %61, label %62, label %44
Our generated machine code is below; much of the time in this function is spent in the (insane) ctpop block, which is replicated four times:
.align 4, 0x90
LBB4_14: ## Parent Loop BB4_5 Depth=1
## => This Inner Loop Header: Depth=2
cmpl $47, %r11d
jg LBB4_17
## BB#15: ## in Loop: Header=BB4_14 Depth=2
orq (%r14), %rax
testq (%r15,%rdx), %rcx
jne LBB4_17
## BB#16: ## in Loop: Header=BB4_14 Depth=2
leaq _b_pawn_attacks(%rip), %r12
movq (%r15,%r12), %r12
andq -16(%rsp), %r12 ## 8-byte Folded Reload
movq %r12, %r13
shrq %r13
andq %rsi, %r13
subq %r13, %r12
movq %r12, %r13
andq %rdi, %r13
shrq $2, %r12
andq %rdi, %r12
addq %r13, %r12
movq %r12, %r13
shrq $4, %r13
addq %r12, %r13
andq %r8, %r13
imulq %r9, %r13
shrq $56, %r13
leaq _w_pawn_attacks(%rip), %r12
movq (%r15,%r12), %r12
andq -8(%rsp), %r12 ## 8-byte Folded Reload
movq %r12, %rbp
shrq %rbp
andq %rsi, %rbp
subq %rbp, %r12
movq %r12, %rbp
andq %rdi, %rbp
shrq $2, %r12
andq %rdi, %r12
addq %rbp, %r12
movq %r12, %rbp
shrq $4, %rbp
addq %r12, %rbp
andq %r8, %rbp
imulq %r9, %rbp
shrq $56, %rbp
subl %r13d, %ebp
addq $64, %r15
addl $8, %r11d
addq $64, %r14
cmpl $2, %ebp
jne LBB4_14
We've done some pretty heroic LSR there and hoisted all the ctpop constants, and I don't see any obvious improvements offhand. This same loop is duplicated further down in the function: the first copy accounts for 11% of the time, the second for 13%, a third later on for 11%, and a fourth for 10%.
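For reference, the expanded ctpop block above is the usual SWAR popcount lowering; a minimal C equivalent (the masks are the standard constants, which I'm assuming are what got hoisted into %rsi/%rdi/%r8/%r9):

static inline unsigned popcount64_swar(unsigned long long v) {
  v -= (v >> 1) & 0x5555555555555555ULL;                                  /* 2-bit partial counts */
  v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);   /* 4-bit partial counts */
  v = (v + (v >> 4)) & 0x0f0f0f0f0f0f0f0fULL;                             /* per-byte counts */
  return (unsigned)((v * 0x0101010101010101ULL) >> 56);                   /* sum the bytes */
}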
This seems like an important loop! One simple thing that I see is that we have:
...
imulq %r9, %r13
shrq $56, %r13
...
imulq %r9, %rbp
shrq $56, %rbp
subl %r13d, %ebp
...
cmpl $2, %ebp
jne LBB4_14
Is there some cheaper way to do ((x*0x101010101010101)>>56)-((y*0x101010101010101)>>56) != 2?
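To make the question concrete, those two imul/shr pairs are just the final byte-summing step of the sequence above, applied to each operand; a sketch (x and y name the per-byte-count intermediates, my choice):

static inline int popcount_diff_not_two(unsigned long long x, unsigned long long y) {
  /* (v * K) >> 56 adds up the bytes of v.  Note that folding the two
     reductions into a single ((x - y) * K) >> 56 would not be sound in
     general: per-byte differences can go negative and borrow across
     byte lanes. */
  const unsigned long long K = 0x0101010101010101ULL;
  return ((int)((x * K) >> 56) - (int)((y * K) >> 56)) != 2;
}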
In Evaluate, things are more difficult to analyze. One commonly repeated pattern is a "conditional negate", which looks like this in IR:
DrawScore.exit: ; preds = %186, %180, %171
%188 = phi i32 [ %172, %171 ], [ %storemerge1.i, %180 ], [ %.392, %186 ]
%189 = sub nsw i32 0, %188
%190 = select i1 %170, i32 %189, i32 %188
and codegens to:
LBB3_62: ## %DrawScore.exit
movl %edx, %eax
negl %eax
testl %ecx, %ecx
cmovnel %edx, %eax
I don't know if there is a trickier way to do that with flags.
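One branchless alternative at the source level is the xor/subtract mask form, though whether it would actually beat test+cmov here I don't know; a sketch (my naming):

static inline int negate_if(int x, int cond /* 0 or 1 */) {
  int mask = -cond;              /* 0 if cond is 0, all-ones if cond is 1 */
  return (x ^ mask) - mask;      /* cond ? -x : x */
}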
This occurs later in a somewhat hot place as well:
%448 = icmp eq i32 %wtm, 0
%449 = sub nsw i32 0, %storemerge310.lcssa
%450 = select i1 %448, i32 %449, i32 %storemerge310.lcssa
%451 = load i32* @largest_positional_score, align 4, !tbaa !3
%452 = sub nsw i32 %450, %451
which codegens to:
movl 180(%rsp), %eax ## 4-byte Reload
movl %eax, %ecx
negl %ecx
cmpl $0, 16(%rsp) ## 4-byte Folded Reload
cmovnel %eax, %ecx
movl _largest_positional_score(%rip), %eax
movl %ecx, %edx
subl %eax, %edx
That seems like it should be simplifiable somehow.
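At the source level the pattern is roughly this (a reconstruction with my own names, just to make the simplification target explicit):

static inline int relative_score(int score, int wtm, int largest_positional_score) {
  int s = (wtm == 0) ? -score : score;   /* the conditional negate */
  return s - largest_positional_score;   /* followed by the subtract */
}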
Another conditional negate that is hot is this block:
%955 = and i32 %927, 7
%956 = sub nsw i32 %955, %898
%ispos351 = icmp sgt i32 %956, -1
%neg352 = sub i32 0, %956
%957 = select i1 %ispos351, i32 %956, i32 %neg352
%958 = ashr i32 %927, 3
%959 = sub nsw i32 %958, %900
%ispos353 = icmp sgt i32 %959, -1
%neg354 = sub i32 0, %959
%960 = select i1 %ispos353, i32 %959, i32 %neg354
%961 = icmp sgt i32 %957, %960
%962 = select i1 %961, i32 %957, i32 %960
%963 = sub nsw i32 7, %962
%964 = mul nsw i32 %901, %963
%965 = sdiv i32 %964, 10
%966 = add nsw i32 %965, %952
br label %967
Which codegens to:
LBB3_303: ## in Loop: Header=BB3_287 Depth=1
movl %r15d, %eax
sarl $3, %eax
subl %esi, %eax
movl %eax, %ebp
sarl $31, %ebp
addl %ebp, %eax
xorl %ebp, %eax
andl $7, %r15d
subl %edi, %r15d
movl %r15d, %ebp
sarl $31, %ebp
addl %ebp, %r15d
xorl %ebp, %r15d
cmpl %eax, %r15d
cmovlel %eax, %r15d
movl $7, %eax
subl %r15d, %eax
imull %edx, %eax
movslq %eax, %rax
imulq $1717986919, %rax, %rax ## imm = 0x66666667
movq %rax, %r15
shrq $63, %r15
shrq $32, %rax
sarl $2, %eax
addl %r15d, %eax
addl %r13d, %eax
It seems that there should be a less painful way to sdiv by 10. *shrug* This pattern occurs in a couple places.
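For what it's worth, the imulq $1717986919 / shift tail is the standard signed magic-number divide by 10; if the dividend could be shown to be non-negative (which the compiler evidently can't prove here), the unsigned form drops the extra sign fix-up (the shrq $63 / addl pair). A sketch of the two forms:

static inline int div10_signed(int x) {
  return x / 10;                       /* multiply-high, shifts, plus a sign correction */
}
static inline int div10_nonnegative(int x) {
  return (int)((unsigned)x / 10);      /* multiply-high and shift only; valid only for x >= 0 */
}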
This is an interesting code sequence I noticed:
; <label>:1299 ; preds = %FirstOne.exit160
%1300 = lshr i32 128, %1291
%1301 = and i32 %1266, %1300
%1302 = icmp ne i32 %1301, 0
br label %1303
This codegens to:
movb %bl, %cl
movl $128, %r15d
shrl %cl, %r15d
movl 68(%rsp), %ecx ## 4-byte Reload
testl %r15d, %ecx
setne %cl
Instead of shifting a constant right to test a bit, it seems that we could shift the value right and test against an immediate, giving us something like:
movl 68(%rsp), %r15d ## 4-byte Reload
shrl %cl, %r15d
testl $1, %r15d
jnz ...
I'm not sure if that is safe though.
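For reference, the equivalence that does seem safe (assuming the shift count stays in 0..7, as it should for a 128 >> n mask) tests bit 7-n of the value; a sketch:

static inline int bit_test_form(unsigned x, unsigned n /* 0..7 */) {
  /* ((128u >> n) & x) != 0 holds exactly when bit (7 - n) of x is set. */
  return ((x >> (7u - n)) & 1u) != 0;
}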
Overall, the biggest single win for this function looks like the "divide by 10" idiom. I added a minor README entry about the two shifts, but maybe there is something smart we can do with a bigger scope?
The Attacked() function contains this interesting sequence in a couple of places:
%B.not2 = and i32 %square, 56
%41 = xor i32 %B.not2, 56
%42 = zext i32 %41 to i64
%43 = lshr i64 %40, %42
Which looks like a "conditional shift" by 56. On targets with predication (like ARM) that would probably codegen better as a select.
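For reference, since square & 56 only has bits inside the 56 mask, the xor is arithmetically the same as 56 - (square & 56), so the shift amount is the mirrored multiple-of-8 offset; a sketch (my naming):

static inline unsigned long long mirrored_rank_shift(unsigned long long bits, int square) {
  int amt = 56 - (square & 56);   /* identical to (square & 56) ^ 56, since those bits lie within the mask */
  return bits >> amt;
}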
Anyway, this is probably more than anyone ever wanted to know about crafty.
-Chris