<!doctype html>
<html lang="en">
<head>
  <!-- Encoding declaration comes first so it is seen before any other content is parsed. -->
  <meta charset="utf-8">
  <title>Bug 32164 - Inner product of two vectors gives inefficient code</title>
  <!-- Relative URLs in this document resolve against the LLVM Bugzilla instance. -->
  <base href="https://bugs.llvm.org/">
</head>
<body>
<!--
  Bug metadata: one row per field, field name in the header cell and its
  value in the data cell.
  NOTE(review): the presentational attributes on the table are deprecated,
  but they are kept because the footer text says this markup is delivered as
  mail, where stylesheets may not be applied. Confirm before moving to CSS.
-->
<table border="1" cellspacing="0" cellpadding="8">
  <tbody>
    <tr>
      <th scope="row">Bug ID</th>
      <td><a class="bz_bug_link bz_status_NEW" href="https://bugs.llvm.org/show_bug.cgi?id=32164" title="NEW - Inner product of two vectors gives inefficient code">32164</a></td>
    </tr>
    <tr>
      <th scope="row">Summary</th>
      <td>Inner product of two vectors gives inefficient code</td>
    </tr>
    <tr>
      <th scope="row">Product</th>
      <td>new-bugs</td>
    </tr>
    <tr>
      <th scope="row">Version</th>
      <td>trunk</td>
    </tr>
    <tr>
      <th scope="row">Hardware</th>
      <td>PC</td>
    </tr>
    <tr>
      <th scope="row">OS</th>
      <td>Linux</td>
    </tr>
    <tr>
      <th scope="row">Status</th>
      <td>NEW</td>
    </tr>
    <tr>
      <th scope="row">Severity</th>
      <td>enhancement</td>
    </tr>
    <tr>
      <th scope="row">Priority</th>
      <td>P</td>
    </tr>
    <tr>
      <th scope="row">Component</th>
      <td>new bugs</td>
    </tr>
    <tr>
      <th scope="row">Assignee</th>
      <td>unassignedbugs@nondot.org</td>
    </tr>
    <tr>
      <th scope="row">Reporter</th>
      <td>drraph@gmail.com</td>
    </tr>
    <tr>
      <th scope="row">CC</th>
      <td>llvm-bugs@lists.llvm.org</td>
    </tr>
  </tbody>
</table>
<!--
  Bug description as submitted, kept as preformatted text so the line breaks
  in the C source and assembly listings survive.
  The block is deliberately not wrapped in a p element: p may only contain
  phrasing content, so a div start tag closes it implicitly and a trailing
  p end tag then produces a second, empty paragraph.
-->
<div>
<pre>Consider:
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;
float dotf(float32x4_t x, float32x4_t y) {
float ret=0;
for (int i=0;i!=4;++i) ret+=x[i]*y[i];
return ret;
}
double dotd(float64x4_t x, float64x4_t y) {
double ret=0;
for (int i=0;i!=4;++i) ret+=x[i]*y[i];
return ret;
}
clang trunk with -Ofast -march=corei7 gives:
dotf: # @dotf
mulps xmm0, xmm1
shufps xmm0, xmm0, 27 # xmm0 = xmm0[3,2,1,0]
movaps xmm1, xmm0
movhlps xmm1, xmm1 # xmm1 = xmm1[1,1]
addps xmm1, xmm0
haddps xmm1, xmm1
movaps xmm0, xmm1
ret
dotd: # @dotd
push rbp
mov rbp, rsp
and rsp, -32
sub rsp, 32
movapd xmm1, xmmword ptr [rbp + 16]
movapd xmm0, xmmword ptr [rbp + 32]
mulpd xmm1, xmmword ptr [rbp + 48]
mulpd xmm0, xmmword ptr [rbp + 64]
addpd xmm0, xmm1
shufpd xmm0, xmm0, 1 # xmm0 = xmm0[1,0]
haddpd xmm0, xmm0
mov rsp, rbp
pop rbp
ret
gcc gives more efficient code using the same flags:
dotf:
mulps xmm0, xmm1
haddps xmm0, xmm0
haddps xmm0, xmm0
ret
dotd:
movapd xmm0, XMMWORD PTR [rsp+8]
movapd xmm1, XMMWORD PTR [rsp+56]
mulpd xmm0, XMMWORD PTR [rsp+40]
mulpd xmm1, XMMWORD PTR [rsp+24]
addpd xmm0, xmm1
haddpd xmm0, xmm0
ret
In particular, clang appears to have an unnecessary prologue and epilogue and
an extra shuffle in the case of "double" and an extra shuffle and some more
work in the case of "float".</pre>
</div>
<!--
  Notification footer: the rule separates the bug report from the list of
  reasons this message was sent to the recipient (here, CC-list membership).
-->
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>