[PATCH] D42607: [LoopStrengthReduce, x86] don't add cost for a cmp that will be macro-fused (PR35681)

Sanjay Patel via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 30 12:17:26 PST 2018


spatel added a comment.

Thanks @evstupac and @venkataramanan.kumar.llvm for the feedback. I micro-benchmarked the maxArray code in its -O2 unrolled form from PR28343 (see the object dumps with address offsets below), but as expected, I can't measure any perf difference on Haswell.

It does shrink the loop from 134 bytes to 97 bytes, though.
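
For reference, the kernel is essentially the maxArray loop from PR28343; a minimal sketch of the source (assuming the usual restrict-qualified form) is:

  // Sketch of the maxArray kernel from PR28343 (assumed form); with 65536
  // doubles, the trip count matches the 0x80000-byte offsets in the baseline
  // dump below and the cmpq $0x10000 in the macro-fused form.
  void maxArray(double *__restrict x, double *__restrict y) {
    for (int i = 0; i < 65536; i++)
      if (y[i] > x[i])
        x[i] = y[i];
  }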

Baseline (doesn't account for macro-fusion, so fewer instructions are considered better):

  0000000100001000	movq	$-0x80000, %rax
  0000000100001007	nopw	(%rax,%rax)
  0000000100001010	movupd	0x80000(%rsi,%rax), %xmm0
  0000000100001019	movupd	0x80000(%rdi,%rax), %xmm1
  0000000100001022	maxpd	%xmm1, %xmm0
  0000000100001026	movupd	0x80010(%rdi,%rax), %xmm1
  000000010000102f	movupd	0x80020(%rdi,%rax), %xmm2
  0000000100001038	movupd	0x80030(%rdi,%rax), %xmm3
  0000000100001041	movupd	%xmm0, 0x80000(%rdi,%rax)
  000000010000104a	movupd	0x80010(%rsi,%rax), %xmm0
  0000000100001053	maxpd	%xmm1, %xmm0
  0000000100001057	movupd	%xmm0, 0x80010(%rdi,%rax)
  0000000100001060	movupd	0x80020(%rsi,%rax), %xmm0
  0000000100001069	maxpd	%xmm2, %xmm0
  000000010000106d	movupd	0x80030(%rsi,%rax), %xmm1
  0000000100001076	movupd	%xmm0, 0x80020(%rdi,%rax)
  000000010000107f	maxpd	%xmm3, %xmm1
  0000000100001083	movupd	%xmm1, 0x80030(%rdi,%rax)
  000000010000108c	addq	$0x40, %rax
  0000000100001090	jne	0x100001010
  0000000100001096	retq

With macro-fusion in the cost calculation (one extra instruction, but it allows smaller constant offsets):

  0000000100001000	xorl	%eax, %eax
  0000000100001002	nopw	%cs:(%rax,%rax)
  000000010000100c	nopl	(%rax)
  0000000100001010	movupd	(%rsi,%rax,8), %xmm0
  0000000100001015	movupd	0x10(%rsi,%rax,8), %xmm1
  000000010000101b	movupd	(%rdi,%rax,8), %xmm2
  0000000100001020	maxpd	%xmm2, %xmm0
  0000000100001024	movupd	0x10(%rdi,%rax,8), %xmm2
  000000010000102a	maxpd	%xmm2, %xmm1
  000000010000102e	movupd	0x20(%rdi,%rax,8), %xmm2
  0000000100001034	movupd	0x30(%rdi,%rax,8), %xmm3
  000000010000103a	movupd	%xmm0, (%rdi,%rax,8)
  000000010000103f	movupd	%xmm1, 0x10(%rdi,%rax,8)
  0000000100001045	movupd	0x20(%rsi,%rax,8), %xmm0
  000000010000104b	maxpd	%xmm2, %xmm0
  000000010000104f	movupd	0x30(%rsi,%rax,8), %xmm1
  0000000100001055	maxpd	%xmm3, %xmm1
  0000000100001059	movupd	%xmm0, 0x20(%rdi,%rax,8)
  000000010000105f	movupd	%xmm1, 0x30(%rdi,%rax,8)
  0000000100001065	addq	$0x8, %rax
  0000000100001069	cmpq	$0x10000, %rax
  000000010000106f	jne	0x100001010
  0000000100001071	retq
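
The cmpq $0x10000, %rax / jne pair at the bottom should macro-fuse on Haswell, so the extra compare is effectively free in the decoded stream. A minimal sketch of the cost adjustment this change is after (simplified, hypothetical names; assuming a TTI hook along the lines of canMacroFuseCmp()) looks like:

  // Hypothetical sketch, not the literal patch: skip charging the trip-count
  // compare when the target reports that cmp+branch will macro-fuse, so this
  // form is not penalized relative to the count-down-to-zero form.
  struct LSRCost {
    unsigned NumInsns = 0;
  };

  template <typename TargetInfoT>
  void rateTripCountCompare(LSRCost &C, const TargetInfoT &TTI) {
    // Without fusion, the explicit cmp costs an instruction slot; with
    // fusion, cmp+jcc issue as a single macro-op, so don't add to the cost.
    if (!TTI.canMacroFuseCmp())
      ++C.NumInsns;
  }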


https://reviews.llvm.org/D42607




