[llvm-commits] [llvm] r123621 - in /llvm/trunk: lib/CodeGen/SelectionDAG/TargetLowering.cpp test/CodeGen/X86/ctpop-combine.ll

Chris Lattner clattner at apple.com
Tue Jan 18 00:12:39 PST 2011


On Jan 17, 2011, at 4:04 AM, Benjamin Kramer wrote:

> Author: d0k
> Date: Mon Jan 17 06:04:57 2011
> New Revision: 123621
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=123621&view=rev
> Log:
> Add a DAGCombine to turn (ctpop x) u< 2 into (x & x-1) == 0.
> 
> This shaves off 4 popcounts from the hacked 186.crafty source.
> 
> This is enabled even when a native popcount instruction is available. The
> combined code is one operation longer but it should be faster nevertheless.

Very nice.  This doesn't seem to speed up crafty much, though.  Looking at a Shark trace, much of the time appears to be spent in Evaluate and EvaluatePawns, and much of the time in EvaluatePawns is spent in this loop:

  %indvar460 = phi i64 [ %indvar.next461, %53 ], [ 0, %FirstOne.exit ]
  %45 = phi i64 [ %49, %53 ], [ %20, %FirstOne.exit ]
  %tmp464 = shl i64 %indvar460, 3
  %tmp465 = add i64 %tmp463, %tmp464
  %scevgep466 = getelementptr [64 x i64]* @w_pawn_attacks, i64 0, i64 %tmp465
  %scevgep467 = getelementptr [64 x i64]* @b_pawn_attacks, i64 0, i64 %tmp465
  %tmp473 = add i64 %tmp472, %tmp464
  %storemerge56 = trunc i64 %tmp473 to i32
  %46 = icmp slt i32 %storemerge56, 48
  br i1 %46, label %47, label %62
; <label>:47                                      ; preds = %44
  %tmp470 = add i64 %tmp469, %tmp464
  %scevgep471 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp470
  %scevgep468 = getelementptr [65 x i64]* @set_mask, i64 0, i64 %tmp465
  %48 = load i64* %scevgep471, align 8, !tbaa !2
  %49 = or i64 %48, %45
  %50 = load i64* %scevgep468, align 8, !tbaa !2
  %51 = and i64 %17, %50
  %52 = icmp eq i64 %51, 0
  br i1 %52, label %53, label %62
; <label>:53                                      ; preds = %47
  %54 = load i64* %scevgep467, align 8, !tbaa !2
  %55 = and i64 %15, %54
  %56 = tail call i64 @llvm.ctpop.i64(i64 %55) nounwind
  %cast.i69 = trunc i64 %56 to i32
  %57 = load i64* %scevgep466, align 8, !tbaa !2
  %58 = and i64 %18, %57
  %59 = tail call i64 @llvm.ctpop.i64(i64 %58) nounwind
  %cast.i70 = trunc i64 %59 to i32
  %60 = sub nsw i32 %cast.i70, %cast.i69
  %61 = icmp eq i32 %60, 2
  %indvar.next461 = add i64 %indvar460, 1
  br i1 %61, label %62, label %44

Our generated machine code is this.  Much of the time in this function is spent in the (insane) ctpop block, which is replicated 4 times:

	.align	4, 0x90
LBB4_14:                                ##   Parent Loop BB4_5 Depth=1
                                        ## =>  This Inner Loop Header: Depth=2
	cmpl	$47, %r11d
	jg	LBB4_17
## BB#15:                               ##   in Loop: Header=BB4_14 Depth=2
	orq	(%r14), %rax
	testq	(%r15,%rdx), %rcx
	jne	LBB4_17
## BB#16:                               ##   in Loop: Header=BB4_14 Depth=2
	leaq	_b_pawn_attacks(%rip), %r12
	movq	(%r15,%r12), %r12
	andq	-16(%rsp), %r12         ## 8-byte Folded Reload
	movq	%r12, %r13
	shrq	%r13
	andq	%rsi, %r13
	subq	%r13, %r12
	movq	%r12, %r13
	andq	%rdi, %r13
	shrq	$2, %r12
	andq	%rdi, %r12
	addq	%r13, %r12
	movq	%r12, %r13
	shrq	$4, %r13
	addq	%r12, %r13
	andq	%r8, %r13
	imulq	%r9, %r13
	shrq	$56, %r13
	leaq	_w_pawn_attacks(%rip), %r12
	movq	(%r15,%r12), %r12
	andq	-8(%rsp), %r12          ## 8-byte Folded Reload
	movq	%r12, %rbp
	shrq	%rbp
	andq	%rsi, %rbp
	subq	%rbp, %r12
	movq	%r12, %rbp
	andq	%rdi, %rbp
	shrq	$2, %r12
	andq	%rdi, %r12
	addq	%rbp, %r12
	movq	%r12, %rbp
	shrq	$4, %rbp
	addq	%r12, %rbp
	andq	%r8, %rbp
	imulq	%r9, %rbp
	shrq	$56, %rbp
	subl	%r13d, %ebp
	addq	$64, %r15
	addl	$8, %r11d
	addq	$64, %r14
	cmpl	$2, %ebp
	jne	LBB4_14

We've done some pretty heroic LSR there and hoisted all the ctpop constants.  I don't see any obvious improvements offhand.  This same loop is duplicated lower in the function: the first copy accounts for 11% of the time, the second for 13%, a third copy later for another 11%, and a fourth for 10%.

This seems like an important loop!  One simple thing that I see is that we have:

...
	imulq	%r9, %r13
	shrq	$56, %r13
...
	imulq	%r9, %rbp
	shrq	$56, %rbp
	subl	%r13d, %ebp
...
	cmpl	$2, %ebp
	jne	LBB4_14

Is there some cheaper way to do ((x*0x101010101010101)>>56)-((y*0x101010101010101)>>56) != 2?
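
For context, that multiply/shift pair is just the final horizontal-add step of the SWAR popcount the backend expands ctpop into; in C it is roughly the following (function and variable names are made up, and the constants are presumably the hoisted values living in registers in the loop above):

  #include <stdint.h>

  /* The expanded ctpop: pairwise 2-bit sums, then 4-bit sums, then byte sums;
     the multiply by 0x0101010101010101 adds all eight byte counts into the top
     byte and the >>56 extracts it.  Illustrative rendering only. */
  static unsigned popcount64(uint64_t x) {
    x = x - ((x >> 1) & 0x5555555555555555ULL);
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    return (unsigned)((x * 0x0101010101010101ULL) >> 56);
  }

So the imulq/shrq in question is the last line done twice, followed by the subtract and compare.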



In Evaluate, things are more difficult to analyze.  One commonly repeated pattern is a "conditional negate", which looks like this in IR:

DrawScore.exit:                                   ; preds = %186, %180, %171
  %188 = phi i32 [ %172, %171 ], [ %storemerge1.i, %180 ], [ %.392, %186 ]
  %189 = sub nsw i32 0, %188
  %190 = select i1 %170, i32 %189, i32 %188

and codegens to:

LBB3_62:                                ## %DrawScore.exit
	movl	%edx, %eax
	negl	%eax
	testl	%ecx, %ecx
	cmovnel	%edx, %eax

I don't know if there is a trickier way to do that with flags.
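
For what it's worth, the classic branchless form of a conditional negate is (x ^ -c) + c with the condition materialized as 0 or 1; whether that would actually beat the neg/cmov pair here I haven't measured.  A sketch in C (names made up):

  /* Conditional negate without a branch or cmov: when c == 1 the xor with -1
     complements x and the +1 finishes the two's-complement negation; when
     c == 0 both steps are no-ops.  Like the sub nsw in the IR above, this
     assumes x != INT_MIN.  Illustrative only. */
  static int cond_negate(int x, int c /* 0 or 1 */) {
    int mask = -c;              /* 0x00000000 or 0xffffffff */
    return (x ^ mask) + c;
  }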

This occurs later in a somewhat hot place as well:

  %448 = icmp eq i32 %wtm, 0
  %449 = sub nsw i32 0, %storemerge310.lcssa
  %450 = select i1 %448, i32 %449, i32 %storemerge310.lcssa
  %451 = load i32* @largest_positional_score, align 4, !tbaa !3
  %452 = sub nsw i32 %450, %451

	movl	180(%rsp), %eax         ## 4-byte Reload
	movl	%eax, %ecx
	negl	%ecx
	cmpl	$0, 16(%rsp)            ## 4-byte Folded Reload
	cmovnel	%eax, %ecx
	movl	_largest_positional_score(%rip), %eax
	movl	%ecx, %edx
	subl	%eax, %edx

That seems like it should be simplifiable somehow.

Another conditional negate that is hot is this block:

  %955 = and i32 %927, 7
  %956 = sub nsw i32 %955, %898
  %ispos351 = icmp sgt i32 %956, -1
  %neg352 = sub i32 0, %956
  %957 = select i1 %ispos351, i32 %956, i32 %neg352
  %958 = ashr i32 %927, 3
  %959 = sub nsw i32 %958, %900
  %ispos353 = icmp sgt i32 %959, -1
  %neg354 = sub i32 0, %959
  %960 = select i1 %ispos353, i32 %959, i32 %neg354
  %961 = icmp sgt i32 %957, %960
  %962 = select i1 %961, i32 %957, i32 %960
  %963 = sub nsw i32 7, %962
  %964 = mul nsw i32 %901, %963
  %965 = sdiv i32 %964, 10
  %966 = add nsw i32 %965, %952
  br label %967

Which codegens to:


LBB3_303:                               ##   in Loop: Header=BB3_287 Depth=1
	movl	%r15d, %eax
	sarl	$3, %eax
	subl	%esi, %eax
	movl	%eax, %ebp
	sarl	$31, %ebp
	addl	%ebp, %eax
	xorl	%ebp, %eax
	andl	$7, %r15d
	subl	%edi, %r15d
	movl	%r15d, %ebp
	sarl	$31, %ebp
	addl	%ebp, %r15d
	xorl	%ebp, %r15d
	cmpl	%eax, %r15d
	cmovlel	%eax, %r15d
	movl	$7, %eax
	subl	%r15d, %eax
	imull	%edx, %eax
	movslq	%eax, %rax
	imulq	$1717986919, %rax, %rax ## imm = 0x66666667
	movq	%rax, %r15
	shrq	$63, %r15
	shrq	$32, %rax
	sarl	$2, %eax
	addl	%r15d, %eax
	addl	%r13d, %eax

It seems that there should be a less painful way to sdiv by 10.  *shrug*  This pattern occurs in a couple of places.
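
For reference, the imulq $1717986919 / shift / add tail is the standard magic-number expansion of a truncating signed divide by 10.  In C it is roughly the following (names made up; this assumes arithmetic right shift of negative values, which is what the sarl in the asm is doing):

  #include <stdint.h>

  /* n / 10 for 32-bit n via multiply-high: 0x66666667 is the magic
     multiplier, the >> 34 folds the "take the high 32 bits" and "sar $2"
     steps together, and the final add bumps negative results up by one so
     the quotient truncates toward zero (the asm adds the sign bit of the
     product, which is the same thing here).  Illustrative only. */
  static int32_t sdiv10(int32_t n) {
    int64_t prod = (int64_t)n * 0x66666667LL;
    int32_t q = (int32_t)(prod >> 34);
    return q + (int32_t)((uint32_t)n >> 31);
  }

The multiply-high itself seems hard to avoid as long as the source really does an integer divide by 10.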

This is an interesting code sequence I noticed:

; <label>:1299                                    ; preds = %FirstOne.exit160
  %1300 = lshr i32 128, %1291
  %1301 = and i32 %1266, %1300
  %1302 = icmp ne i32 %1301, 0
  br label %1303

This codegens to:

	movb	%bl, %cl
	movl	$128, %r15d
	shrl	%cl, %r15d
	movl	68(%rsp), %ecx          ## 4-byte Reload
	testl	%r15d, %ecx
	setne	%cl

Instead of shifting a constant right to test a bit, it seems that we could shift the value right and use a test, giving us something like:

movl	68(%rsp), %r15d         ## 4-byte Reload
shrl	%cl, %r15d
testl	$1, %r15d
jnz	...

I'm not sure if that is safe though.
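
Thinking about it a bit more: 128 >> n has only bit 7-n set, so the original is testing bit 7-n of x; shifting the value instead means shifting x left by n and testing against 128, or shifting right by 7-n and testing bit 0, rather than shifting right by n.  A quick C check of the equivalences (names made up; valid for shift counts 0..7):

  #include <assert.h>
  #include <stdint.h>

  /* (128 >> n) & x != 0 tests bit 7-n of x, so the shift of the mask can be
     traded for a shift of the value, at the cost of inverting the count.
     Purely a sanity check with made-up names. */
  static void check_bit_test(uint32_t x, unsigned n) {
    int mask_form  = ((128u >> n) & x) != 0;
    int left_form  = ((x << n) & 128u) != 0;
    int right_form = ((x >> (7u - n)) & 1u) != 0;
    assert(mask_form == left_form && mask_form == right_form);
  }

So the sketch above would need 7-n (or a left shift) as the amount, which may be what makes it not obviously safe.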

Overall, the best opportunity to speed up this function looks to be the "divide by 10" idiom.  I added a minor README entry about the two shifts, but maybe there is something smarter we can do with a bigger scope?


The Attacked() function contains this interesting sequence in a couple of places:

  %B.not2 = and i32 %square, 56
  %41 = xor i32 %B.not2, 56
  %42 = zext i32 %41 to i64
  %43 = lshr i64 %40, %42

Which looks like a "conditional shift" by 56.  On targets with predication (like ARM) that would probably codegen better as a select.
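
In C terms the pair of instructions computes the shift amount as 56 ^ (square & 56), i.e. 56 minus the high three bits of the square number; a rendering with made-up names:

  #include <stdint.h>

  /* square & 56 keeps the top three bits of a 0..63 square number (a multiple
     of 8), and xoring that with 56 flips them, so the shift amount is
     56 - (square & 56).  When square & 56 only ever takes the values 0 and
     56, this is exactly the conditional shift by 56.  Illustrative only. */
  static uint64_t cond_shift(uint64_t bb, int square) {
    unsigned amount = 56u ^ ((unsigned)square & 56u);
    return bb >> amount;
  }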

Anyway, this is probably more than anyone ever wanted to know about crafty.

-Chris


