[LLVMbugs] [Bug 21281] New: Performance regression in vector interleave
bugzilla-daemon at llvm.org
Tue Oct 14 16:17:55 PDT 2014
http://llvm.org/bugs/show_bug.cgi?id=21281
Bug ID: 21281
Summary: Performance regression in vector interleave
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: andrew.b.adams at gmail.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
Created attachment 13204
--> http://llvm.org/bugs/attachment.cgi?id=13204&action=edit
ll to reproduce
The attached .ll interleaves eight <8 x i16> vectors into a single <64 x i16> vector. It's
used as part of a fast transpose routine. This particular formulation is used
because it generates great code with llvm <= 3.5 on both x86 and arm. arm is still
superb with trunk (12 zip instructions!), but x86 has regressed in trunk, giving
us a >20% slowdown on a transpose benchmark.
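For reference, the formulation being described is a pairwise shufflevector tree
(interleave pairs of vectors, then interleave those results). The snippet below is
a reduced sketch with four input vectors rather than eight; the function name and
exact masks are illustrative guesses, and the real IR is in attachment 13204.

; Hypothetical reduction of the pattern to four input vectors; the attached .ll
; does the same thing for eight, and its exact shuffles may differ.
; Stage 1 interleaves individual i16 lanes, stage 2 interleaves the resulting
; pairs, which is the kind of pattern that maps naturally onto
; punpcklwd/punpckhwd followed by punpckldq/punpckhdq.
define <32 x i16> @interleave4(<8 x i16> %a, <8 x i16> %b,
                               <8 x i16> %c, <8 x i16> %d) {
  %ab = shufflevector <8 x i16> %a, <8 x i16> %b,
        <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                    i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %cd = shufflevector <8 x i16> %c, <8 x i16> %d,
        <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                    i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %abcd = shufflevector <16 x i16> %ab, <16 x i16> %cd,
          <32 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19,
                      i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23,
                      i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27,
                      i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
  ret <32 x i16> %abcd
}
; Output like the listings below can be produced with something like the
; following (filename and flags are guesses, not the reporter's exact command):
;   llc bug21281.ll -o -               (SSE output)
;   llc -mattr=+avx bug21281.ll -o -   (AVX output)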
Old asm from llvm release 3.5 (36 instructions):
movdqa %xmm0, %xmm8
punpcklwd %xmm1, %xmm8
punpckhwd %xmm1, %xmm0
movdqa %xmm2, %xmm1
punpcklwd %xmm3, %xmm1
punpckhwd %xmm3, %xmm2
movdqa %xmm0, %xmm3
punpckhdq %xmm2, %xmm3
punpckldq %xmm2, %xmm0
movdqa %xmm8, %xmm2
punpckhdq %xmm1, %xmm2
punpckldq %xmm1, %xmm8
movdqa %xmm4, %xmm1
punpcklwd %xmm5, %xmm1
punpckhwd %xmm5, %xmm4
movdqa %xmm7, %xmm5
punpcklwd %xmm6, %xmm5
punpckhwd %xmm6, %xmm7
movdqa %xmm4, %xmm6
punpckhdq %xmm7, %xmm6
punpckldq %xmm7, %xmm4
movdqa %xmm1, %xmm7
punpckhdq %xmm5, %xmm7
punpckldq %xmm5, %xmm1
movdqa %xmm8, %xmm5
punpcklwd %xmm1, %xmm5
punpckhwd %xmm1, %xmm8
movdqa %xmm2, %xmm1
punpcklwd %xmm7, %xmm1
punpckhwd %xmm7, %xmm2
movdqa %xmm0, %xmm7
punpcklwd %xmm4, %xmm7
punpckhwd %xmm4, %xmm0
movdqa %xmm3, %xmm4
punpcklwd %xmm6, %xmm4
punpckhwd %xmm6, %xmm3
New asm from llvm trunk (44 instructions):
pshufd $78, %xmm1, %xmm9
pshufd $78, %xmm0, %xmm8
punpcklwd %xmm9, %xmm8
punpcklwd %xmm1, %xmm0
pshufd $78, %xmm3, %xmm9
pshufd $78, %xmm2, %xmm1
punpcklwd %xmm9, %xmm1
punpcklwd %xmm3, %xmm2
movdqa %xmm0, %xmm9
punpckhdq %xmm2, %xmm9
punpckldq %xmm2, %xmm0
movdqa %xmm8, %xmm2
punpckhdq %xmm1, %xmm2
punpckldq %xmm1, %xmm8
pshufd $78, %xmm5, %xmm1
pshufd $78, %xmm4, %xmm3
punpcklwd %xmm1, %xmm3
punpcklwd %xmm5, %xmm4
pshufd $78, %xmm6, %xmm1
pshufd $78, %xmm7, %xmm5
punpcklwd %xmm1, %xmm5
punpcklwd %xmm6, %xmm7
movdqa %xmm4, %xmm1
punpckhdq %xmm7, %xmm1
punpckldq %xmm7, %xmm4
movdqa %xmm3, %xmm6
punpckhdq %xmm5, %xmm6
punpckldq %xmm5, %xmm3
pshufd $78, %xmm3, %xmm5
pshufd $78, %xmm8, %xmm11
punpcklwd %xmm5, %xmm11
pshufd $78, %xmm6, %xmm10
pshufd $78, %xmm2, %xmm12
punpcklwd %xmm10, %xmm12
pshufd $78, %xmm4, %xmm10
pshufd $78, %xmm0, %xmm7
punpcklwd %xmm10, %xmm7
pshufd $78, %xmm1, %xmm10
pshufd $78, %xmm9, %xmm5
punpcklwd %xmm10, %xmm5
punpcklwd %xmm3, %xmm8
punpcklwd %xmm6, %xmm2
punpcklwd %xmm4, %xmm0
punpcklwd %xmm1, %xmm9
The difference is even more stark with avx enabled:
llvm 3.5 (28 instructions):
vpunpcklwd %xmm1, %xmm0, %xmm8
vpunpckhwd %xmm1, %xmm0, %xmm0
vpunpcklwd %xmm3, %xmm2, %xmm1
vpunpckhwd %xmm3, %xmm2, %xmm2
vpunpckhdq %xmm2, %xmm0, %xmm9
vpunpckldq %xmm2, %xmm0, %xmm2
vpunpckhdq %xmm1, %xmm8, %xmm3
vpunpckldq %xmm1, %xmm8, %xmm0
vpunpcklwd %xmm5, %xmm4, %xmm1
vpunpckhwd %xmm5, %xmm4, %xmm4
vpunpcklwd %xmm6, %xmm7, %xmm5
vpunpckhwd %xmm6, %xmm7, %xmm6
vpunpckhdq %xmm6, %xmm4, %xmm7
vpunpckldq %xmm6, %xmm4, %xmm4
vpunpckhdq %xmm5, %xmm1, %xmm6
vpunpckldq %xmm5, %xmm1, %xmm1
vpunpckhwd %xmm1, %xmm0, %xmm5
vpunpcklwd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm5, %ymm0, %ymm0
vpunpckhwd %xmm6, %xmm3, %xmm1
vpunpcklwd %xmm6, %xmm3, %xmm3
vinsertf128 $1, %xmm1, %ymm3, %ymm1
vpunpckhwd %xmm4, %xmm2, %xmm3
vpunpcklwd %xmm4, %xmm2, %xmm2
vinsertf128 $1, %xmm3, %ymm2, %ymm2
vpunpckhwd %xmm7, %xmm9, %xmm3
vpunpcklwd %xmm7, %xmm9, %xmm4
vinsertf128 $1, %xmm3, %ymm4, %ymm3
llvm trunk (84 instructions):
vpunpcklwd %xmm1, %xmm1, %xmm8
vpmovzxwd %xmm0, %xmm9
vpblendw $-86, %xmm8, %xmm9, %xmm8
vpunpckhwd %xmm1, %xmm1, %xmm1
vpunpckhwd %xmm0, %xmm0, %xmm0
vpblendw $-86, %xmm1, %xmm0, %xmm0
vpunpcklwd %xmm3, %xmm3, %xmm9
vpmovzxwd %xmm2, %xmm1
vpblendw $-86, %xmm9, %xmm1, %xmm1
vpunpckhwd %xmm3, %xmm3, %xmm3
vpunpckhwd %xmm2, %xmm2, %xmm2
vpblendw $-86, %xmm3, %xmm2, %xmm2
vpermilps $96, %xmm2, %xmm3
vpermilps $-24, %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm3, %ymm2
vpermilps $-44, %xmm0, %xmm3
vpermilps $-10, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm3, %ymm0
vblendps $-86, %ymm2, %ymm0, %ymm3
vpermilps $96, %xmm1, %xmm0
vpermilps $-24, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm0, %ymm0
vpermilps $-44, %xmm8, %xmm1
vpermilps $-10, %xmm8, %xmm2
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vblendps $-86, %ymm0, %ymm1, %ymm1
vpunpcklwd %xmm5, %xmm5, %xmm0
vpmovzxwd %xmm4, %xmm2
vpblendw $-86, %xmm0, %xmm2, %xmm0
vpunpckhwd %xmm5, %xmm5, %xmm2
vpunpckhwd %xmm4, %xmm4, %xmm4
vpblendw $-86, %xmm2, %xmm4, %xmm2
vpunpcklwd %xmm6, %xmm6, %xmm4
vpmovzxwd %xmm7, %xmm5
vpblendw $-86, %xmm4, %xmm5, %xmm5
vpunpckhwd %xmm6, %xmm6, %xmm4
vpunpckhwd %xmm7, %xmm7, %xmm6
vpblendw $-86, %xmm4, %xmm6, %xmm4
vpermilps $96, %xmm4, %xmm6
vpermilps $-24, %xmm4, %xmm4
vinsertf128 $1, %xmm4, %ymm6, %ymm4
vpermilps $-44, %xmm2, %xmm6
vpermilps $-10, %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm6, %ymm2
vblendps $-86, %ymm4, %ymm2, %ymm4
vpermilps $96, %xmm5, %xmm2
vpermilps $-24, %xmm5, %xmm5
vinsertf128 $1, %xmm5, %ymm2, %ymm2
vpermilps $-44, %xmm0, %xmm5
vpermilps $-10, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm5, %ymm0
vblendps $-86, %ymm2, %ymm0, %ymm2
vpunpckhwd %xmm2, %xmm2, %xmm0
vpunpckhwd %xmm1, %xmm1, %xmm5
vpblendw $-86, %xmm0, %xmm5, %xmm0
vpunpcklwd %xmm2, %xmm2, %xmm5
vpmovzxwd %xmm1, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vinsertf128 $1, %xmm0, %ymm5, %ymm0
vextractf128 $1, %ymm2, %xmm2
vpunpckhwd %xmm2, %xmm2, %xmm5
vextractf128 $1, %ymm1, %xmm1
vpunpckhwd %xmm1, %xmm1, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vpunpcklwd %xmm2, %xmm2, %xmm2
vpmovzxwd %xmm1, %xmm1
vpblendw $-86, %xmm2, %xmm1, %xmm1
vinsertf128 $1, %xmm5, %ymm1, %ymm1
vpunpckhwd %xmm4, %xmm4, %xmm2
vpunpckhwd %xmm3, %xmm3, %xmm5
vpblendw $-86, %xmm2, %xmm5, %xmm2
vpunpcklwd %xmm4, %xmm4, %xmm5
vpmovzxwd %xmm3, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vinsertf128 $1, %xmm2, %ymm5, %ymm2
vextractf128 $1, %ymm4, %xmm4
vpunpckhwd %xmm4, %xmm4, %xmm5
vextractf128 $1, %ymm3, %xmm3
vpunpckhwd %xmm3, %xmm3, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vpunpcklwd %xmm4, %xmm4, %xmm4
vpmovzxwd %xmm3, %xmm3
vpblendw $-86, %xmm4, %xmm3, %xmm3
vinsertf128 $1, %xmm5, %ymm3, %ymm3