<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Bug 21281 – Performance regression in vector interleave</title>
<base href="http://llvm.org/bugs/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - Performance regression in vector interleave"
href="http://llvm.org/bugs/show_bug.cgi?id=21281">21281</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Performance regression in vector interleave
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>andrew.b.adams@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvmbugs@cs.uiuc.edu
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<div>
<pre>Created <span class=""><a href="attachment.cgi?id=13204" name="attach_13204" title="ll to reproduce">attachment 13204</a> <a href="attachment.cgi?id=13204&amp;action=edit" title="ll to reproduce">[details]</a></span>
ll to reproduce

The attached ll interleaves 8 8xi16 vectors into a single 64xi16 vector. It's
used as part of a fast transpose routine. This particular formulation is used
because it generates great code with llvm <= 3.5 on x86 and arm. arm is still
superb with trunk (12 zip instructions!), but x86 has regressed in trunk, and
gives us a >20% slowdown for a transpose benchmark.

Old asm from llvm release 3.5. 36 instructions.

movdqa %xmm0, %xmm8
punpcklwd %xmm1, %xmm8
punpckhwd %xmm1, %xmm0
movdqa %xmm2, %xmm1
punpcklwd %xmm3, %xmm1
punpckhwd %xmm3, %xmm2
movdqa %xmm0, %xmm3
punpckhdq %xmm2, %xmm3
punpckldq %xmm2, %xmm0
movdqa %xmm8, %xmm2
punpckhdq %xmm1, %xmm2
punpckldq %xmm1, %xmm8
movdqa %xmm4, %xmm1
punpcklwd %xmm5, %xmm1
punpckhwd %xmm5, %xmm4
movdqa %xmm7, %xmm5
punpcklwd %xmm6, %xmm5
punpckhwd %xmm6, %xmm7
movdqa %xmm4, %xmm6
punpckhdq %xmm7, %xmm6
punpckldq %xmm7, %xmm4
movdqa %xmm1, %xmm7
punpckhdq %xmm5, %xmm7
punpckldq %xmm5, %xmm1
movdqa %xmm8, %xmm5
punpcklwd %xmm1, %xmm5
punpckhwd %xmm1, %xmm8
movdqa %xmm2, %xmm1
punpcklwd %xmm7, %xmm1
punpckhwd %xmm7, %xmm2
movdqa %xmm0, %xmm7
punpcklwd %xmm4, %xmm7
punpckhwd %xmm4, %xmm0
movdqa %xmm3, %xmm4
punpcklwd %xmm6, %xmm4
punpckhwd %xmm6, %xmm3

New asm from llvm trunk. 44 instructions.

pshufd $78, %xmm1, %xmm9
pshufd $78, %xmm0, %xmm8
punpcklwd %xmm9, %xmm8
punpcklwd %xmm1, %xmm0
pshufd $78, %xmm3, %xmm9
pshufd $78, %xmm2, %xmm1
punpcklwd %xmm9, %xmm1
punpcklwd %xmm3, %xmm2
movdqa %xmm0, %xmm9
punpckhdq %xmm2, %xmm9
punpckldq %xmm2, %xmm0
movdqa %xmm8, %xmm2
punpckhdq %xmm1, %xmm2
punpckldq %xmm1, %xmm8
pshufd $78, %xmm5, %xmm1
pshufd $78, %xmm4, %xmm3
punpcklwd %xmm1, %xmm3
punpcklwd %xmm5, %xmm4
pshufd $78, %xmm6, %xmm1
pshufd $78, %xmm7, %xmm5
punpcklwd %xmm1, %xmm5
punpcklwd %xmm6, %xmm7
movdqa %xmm4, %xmm1
punpckhdq %xmm7, %xmm1
punpckldq %xmm7, %xmm4
movdqa %xmm3, %xmm6
punpckhdq %xmm5, %xmm6
punpckldq %xmm5, %xmm3
pshufd $78, %xmm3, %xmm5
pshufd $78, %xmm8, %xmm11
punpcklwd %xmm5, %xmm11
pshufd $78, %xmm6, %xmm10
pshufd $78, %xmm2, %xmm12
punpcklwd %xmm10, %xmm12
pshufd $78, %xmm4, %xmm10
pshufd $78, %xmm0, %xmm7
punpcklwd %xmm10, %xmm7
pshufd $78, %xmm1, %xmm10
pshufd $78, %xmm9, %xmm5
punpcklwd %xmm10, %xmm5
punpcklwd %xmm3, %xmm8
punpcklwd %xmm6, %xmm2
punpcklwd %xmm4, %xmm0
punpcklwd %xmm1, %xmm9

The difference is even more stark with avx enabled:

llvm 3.5. 28 instructions.

vpunpcklwd %xmm1, %xmm0, %xmm8
vpunpckhwd %xmm1, %xmm0, %xmm0
vpunpcklwd %xmm3, %xmm2, %xmm1
vpunpckhwd %xmm3, %xmm2, %xmm2
vpunpckhdq %xmm2, %xmm0, %xmm9
vpunpckldq %xmm2, %xmm0, %xmm2
vpunpckhdq %xmm1, %xmm8, %xmm3
vpunpckldq %xmm1, %xmm8, %xmm0
vpunpcklwd %xmm5, %xmm4, %xmm1
vpunpckhwd %xmm5, %xmm4, %xmm4
vpunpcklwd %xmm6, %xmm7, %xmm5
vpunpckhwd %xmm6, %xmm7, %xmm6
vpunpckhdq %xmm6, %xmm4, %xmm7
vpunpckldq %xmm6, %xmm4, %xmm4
vpunpckhdq %xmm5, %xmm1, %xmm6
vpunpckldq %xmm5, %xmm1, %xmm1
vpunpckhwd %xmm1, %xmm0, %xmm5
vpunpcklwd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm5, %ymm0, %ymm0
vpunpckhwd %xmm6, %xmm3, %xmm1
vpunpcklwd %xmm6, %xmm3, %xmm3
vinsertf128 $1, %xmm1, %ymm3, %ymm1
vpunpckhwd %xmm4, %xmm2, %xmm3
vpunpcklwd %xmm4, %xmm2, %xmm2
vinsertf128 $1, %xmm3, %ymm2, %ymm2
vpunpckhwd %xmm7, %xmm9, %xmm3
vpunpcklwd %xmm7, %xmm9, %xmm4
vinsertf128 $1, %xmm3, %ymm4, %ymm3

llvm trunk. 84 instructions.

vpunpcklwd %xmm1, %xmm1, %xmm8
vpmovzxwd %xmm0, %xmm9
vpblendw $-86, %xmm8, %xmm9, %xmm8
vpunpckhwd %xmm1, %xmm1, %xmm1
vpunpckhwd %xmm0, %xmm0, %xmm0
vpblendw $-86, %xmm1, %xmm0, %xmm0
vpunpcklwd %xmm3, %xmm3, %xmm9
vpmovzxwd %xmm2, %xmm1
vpblendw $-86, %xmm9, %xmm1, %xmm1
vpunpckhwd %xmm3, %xmm3, %xmm3
vpunpckhwd %xmm2, %xmm2, %xmm2
vpblendw $-86, %xmm3, %xmm2, %xmm2
vpermilps $96, %xmm2, %xmm3
vpermilps $-24, %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm3, %ymm2
vpermilps $-44, %xmm0, %xmm3
vpermilps $-10, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm3, %ymm0
vblendps $-86, %ymm2, %ymm0, %ymm3
vpermilps $96, %xmm1, %xmm0
vpermilps $-24, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm0, %ymm0
vpermilps $-44, %xmm8, %xmm1
vpermilps $-10, %xmm8, %xmm2
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vblendps $-86, %ymm0, %ymm1, %ymm1
vpunpcklwd %xmm5, %xmm5, %xmm0
vpmovzxwd %xmm4, %xmm2
vpblendw $-86, %xmm0, %xmm2, %xmm0
vpunpckhwd %xmm5, %xmm5, %xmm2
vpunpckhwd %xmm4, %xmm4, %xmm4
vpblendw $-86, %xmm2, %xmm4, %xmm2
vpunpcklwd %xmm6, %xmm6, %xmm4
vpmovzxwd %xmm7, %xmm5
vpblendw $-86, %xmm4, %xmm5, %xmm5
vpunpckhwd %xmm6, %xmm6, %xmm4
vpunpckhwd %xmm7, %xmm7, %xmm6
vpblendw $-86, %xmm4, %xmm6, %xmm4
vpermilps $96, %xmm4, %xmm6
vpermilps $-24, %xmm4, %xmm4
vinsertf128 $1, %xmm4, %ymm6, %ymm4
vpermilps $-44, %xmm2, %xmm6
vpermilps $-10, %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm6, %ymm2
vblendps $-86, %ymm4, %ymm2, %ymm4
vpermilps $96, %xmm5, %xmm2
vpermilps $-24, %xmm5, %xmm5
vinsertf128 $1, %xmm5, %ymm2, %ymm2
vpermilps $-44, %xmm0, %xmm5
vpermilps $-10, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm5, %ymm0
vblendps $-86, %ymm2, %ymm0, %ymm2
vpunpckhwd %xmm2, %xmm2, %xmm0
vpunpckhwd %xmm1, %xmm1, %xmm5
vpblendw $-86, %xmm0, %xmm5, %xmm0
vpunpcklwd %xmm2, %xmm2, %xmm5
vpmovzxwd %xmm1, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vinsertf128 $1, %xmm0, %ymm5, %ymm0
vextractf128 $1, %ymm2, %xmm2
vpunpckhwd %xmm2, %xmm2, %xmm5
vextractf128 $1, %ymm1, %xmm1
vpunpckhwd %xmm1, %xmm1, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vpunpcklwd %xmm2, %xmm2, %xmm2
vpmovzxwd %xmm1, %xmm1
vpblendw $-86, %xmm2, %xmm1, %xmm1
vinsertf128 $1, %xmm5, %ymm1, %ymm1
vpunpckhwd %xmm4, %xmm4, %xmm2
vpunpckhwd %xmm3, %xmm3, %xmm5
vpblendw $-86, %xmm2, %xmm5, %xmm2
vpunpcklwd %xmm4, %xmm4, %xmm5
vpmovzxwd %xmm3, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vinsertf128 $1, %xmm2, %ymm5, %ymm2
vextractf128 $1, %ymm4, %xmm4
vpunpckhwd %xmm4, %xmm4, %xmm5
vextractf128 $1, %ymm3, %xmm3
vpunpckhwd %xmm3, %xmm3, %xmm6
vpblendw $-86, %xmm5, %xmm6, %xmm5
vpunpcklwd %xmm4, %xmm4, %xmm4
vpmovzxwd %xmm3, %xmm3
vpblendw $-86, %xmm4, %xmm3, %xmm3
vinsertf128 $1, %xmm5, %ymm3, %ymm3</pre>
</div>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>