[PATCH] D20598: [X86] Detect SAD patterns and emit psadbw instructions on X86 redux

Fri May 27 09:20:30 PDT 2016

mkuper added a comment.

I agree the spill code is complete nonsense, but I don't think it should block this patch, for two reasons:

1. We seem to have a real problem with inserts into oversize vectors (e.g. <16 x i32> on SSE2), but I'm not entirely sure it's a high priority. I'm not sure how many of those the vectorizer actually generates - although they may be more common for reductions; I haven't checked.

2. More importantly, even with the spills, it still looks like a large net improvement. E.g. the SSE2 code for the sad16_i8 loop before this patch is:

  .LBB0_1:                                # %vector.body
                                          # =>This Inner Loop Header: Depth=1
  	movdqu	a+1024(%rax), %xmm7
  	pshufd	$78, %xmm7, %xmm5       # xmm5 = xmm7[2,3,0,1]
  	punpcklbw	%xmm10, %xmm5   # xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
  	movdqa	%xmm5, %xmm6
  	punpcklwd	%xmm10, %xmm6   # xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
  	punpckhwd	%xmm10, %xmm5   # xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
  	punpcklbw	%xmm10, %xmm7   # xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
  	movdqa	%xmm7, %xmm0
  	punpcklwd	%xmm10, %xmm0   # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
  	punpckhwd	%xmm10, %xmm7   # xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
  	movdqu	b+1024(%rax), %xmm1
  	pshufd	$78, %xmm1, %xmm2       # xmm2 = xmm1[2,3,0,1]
  	punpcklbw	%xmm10, %xmm2   # xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
  	movdqa	%xmm2, %xmm3
  	punpcklwd	%xmm10, %xmm3   # xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
  	punpckhwd	%xmm10, %xmm2   # xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
  	punpcklbw	%xmm10, %xmm1   # xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
  	movdqa	%xmm1, %xmm4
  	punpcklwd	%xmm10, %xmm4   # xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
  	punpckhwd	%xmm10, %xmm1   # xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
  	psubd	%xmm1, %xmm7
  	psubd	%xmm4, %xmm0
  	psubd	%xmm2, %xmm5
  	psubd	%xmm3, %xmm6
  	movdqa	%xmm6, %xmm1
  	psrad	$31, %xmm1
  	paddd	%xmm1, %xmm6
  	pxor	%xmm1, %xmm6
  	movdqa	%xmm5, %xmm1
  	psrad	$31, %xmm1
  	paddd	%xmm1, %xmm5
  	pxor	%xmm1, %xmm5
  	movdqa	%xmm0, %xmm1
  	psrad	$31, %xmm1
  	paddd	%xmm1, %xmm0
  	pxor	%xmm1, %xmm0
  	movdqa	%xmm7, %xmm1
  	psrad	$31, %xmm1
  	paddd	%xmm1, %xmm7
  	pxor	%xmm1, %xmm7
  	paddd	%xmm7, %xmm8
  	paddd	%xmm0, %xmm9
  	paddd	%xmm5, %xmm11
  	paddd	%xmm6, %xmm12
  	addq	$4, %rax
  	jne	.LBB0_1

And with this patch:

  .LBB0_1:                                # %vector.body
                                          # =>This Inner Loop Header: Depth=1
  	movdqa	%xmm0, %xmm4
  	movdqu	a+1024(%rax), %xmm5
  	movdqu	b+1024(%rax), %xmm0
  	movdqa	%xmm4, (%rsp)
  	movdqa	%xmm1, 16(%rsp)
  	movdqa	%xmm3, 32(%rsp)
  	movdqa	%xmm2, 48(%rsp)
  	psadbw	%xmm5, %xmm0
  	paddd	%xmm4, %xmm0
  	movdqa	%xmm0, (%rsp)
  	movdqa	16(%rsp), %xmm1
  	movdqa	32(%rsp), %xmm3
  	movdqa	48(%rsp), %xmm2
  	addq	$4, %rax

Are you ok with me committing as is, and filing a PR on the insert_subvector issue?

http://reviews.llvm.org/D20598