[PATCH] D20598: [X86] Detect SAD patterns and emit psadbw instructions on X86 redux
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Fri May 27 09:20:30 PDT 2016
mkuper added a comment.
I agree the spill code is complete nonsense, but I don't think it should block this patch, for two reasons:
1. We seem to have a real problem with inserts into oversize vectors (e.g. <16 x i32> on SSE2), but I'm not entirely sure it's a high priority. I'm not sure how many of those the vectorizer actually generates - although, for reductions, that may be more common; I haven't checked.
2. More importantly, even with the spills, it still looks like a large net improvement. E.g. the SSE2 code for the sad16_i8 loop before this patch is:
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu a+1024(%rax), %xmm7
pshufd $78, %xmm7, %xmm5 # xmm5 = xmm7[2,3,0,1]
punpcklbw %xmm10, %xmm5 # xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
movdqa %xmm5, %xmm6
punpcklwd %xmm10, %xmm6 # xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
punpckhwd %xmm10, %xmm5 # xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
punpcklbw %xmm10, %xmm7 # xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
movdqa %xmm7, %xmm0
punpcklwd %xmm10, %xmm0 # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
punpckhwd %xmm10, %xmm7 # xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
movdqu b+1024(%rax), %xmm1
pshufd $78, %xmm1, %xmm2 # xmm2 = xmm1[2,3,0,1]
punpcklbw %xmm10, %xmm2 # xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
movdqa %xmm2, %xmm3
punpcklwd %xmm10, %xmm3 # xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
punpckhwd %xmm10, %xmm2 # xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
punpcklbw %xmm10, %xmm1 # xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
movdqa %xmm1, %xmm4
punpcklwd %xmm10, %xmm4 # xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
punpckhwd %xmm10, %xmm1 # xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
psubd %xmm1, %xmm7
psubd %xmm4, %xmm0
psubd %xmm2, %xmm5
psubd %xmm3, %xmm6
movdqa %xmm6, %xmm1
psrad $31, %xmm1
paddd %xmm1, %xmm6
pxor %xmm1, %xmm6
movdqa %xmm5, %xmm1
psrad $31, %xmm1
paddd %xmm1, %xmm5
pxor %xmm1, %xmm5
movdqa %xmm0, %xmm1
psrad $31, %xmm1
paddd %xmm1, %xmm0
pxor %xmm1, %xmm0
movdqa %xmm7, %xmm1
psrad $31, %xmm1
paddd %xmm1, %xmm7
pxor %xmm1, %xmm7
paddd %xmm7, %xmm8
paddd %xmm0, %xmm9
paddd %xmm5, %xmm11
paddd %xmm6, %xmm12
addq $4, %rax
jne .LBB0_1
And with this patch:
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqa %xmm0, %xmm4
movdqu a+1024(%rax), %xmm5
movdqu b+1024(%rax), %xmm0
movdqa %xmm4, (%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm3, 32(%rsp)
movdqa %xmm2, 48(%rsp)
psadbw %xmm5, %xmm0
paddd %xmm4, %xmm0
movdqa %xmm0, (%rsp)
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm3
movdqa 48(%rsp), %xmm2
addq $4, %rax
Are you ok with me committing as is, and filing a PR on the insert_subvector issue?
http://reviews.llvm.org/D20598
More information about the llvm-commits
mailing list