[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
martin krastev
blu.dark at gmail.com
Mon Oct 13 09:13:24 PDT 2014
Hello,
Depending on how I extract integer lanes from an x86_64 xmm register, the
backend may spill that register in order to load scalars. The effect was
observed on two targets: corei7-avx and btver1 (I haven't checked other
targets).
Here's a test case in which the spilling and non-spilling versions of the
lane accessor are selected by conditional compilation (SPILLING_ENSUES):
#if __SSE4_1__ != 0
#include <smmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>
#if SPILLING_ENSUES == 1
// Lane accessor, intrinsics variant (compiled when SPILLING_ENSUES == 1).
// Returns 32-bit lane i (0..3) of v: lane 0 is read directly, the others
// are first shuffled into lane 0 and then moved to a scalar.
// Any other i hits assert(0); the trailing return -1 is only reached
// when asserts are compiled out.
static int32_t geti(const __m128i v, const size_t i)
{
switch (i) {
case 0:
return _mm_cvtsi128_si32(v);
case 1:
// 0xe5: low two bits = 01, i.e. source lane 1 lands in result lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
case 2:
// 0xe6: low two bits = 10, i.e. source lane 2 lands in result lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
case 3:
// 0xe7: low two bits = 11, i.e. source lane 3 lands in result lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
}
assert(0);
return -1;
}
#else
// Lane accessor, vector-subscript variant (compiled when SPILLING_ENSUES
// is not 1). Returns 32-bit lane i (0..3) of v by subscripting the vector
// (v[0], v[1]) and taking the low or high half via shift + narrowing cast.
// NOTE(review): relies on the compiler's vector subscript extension and
// assumes each element of __m128i is 64 bits wide (the >> 32 requires it).
// Any other i hits assert(0); the trailing return -1 is only reached
// when asserts are compiled out.
static int32_t geti(const __m128i v, const size_t i)
{
switch (i) {
case 0:
// low half of element 0
return int32_t(v[0] >> 0);
case 1:
// high half of element 0
return int32_t(v[0] >> 32);
case 2:
// low half of element 1
return int32_t(v[1] >> 0);
case 3:
// high half of element 1
return int32_t(v[1] >> 32);
}
assert(0);
return -1;
}
#endif
// Input vector passed to testee() from main(). Non-static global.
__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
// Destination for the result of testee(); written once in main().
__m128 r[1];
// Lookup table indexed per lane by the 'rem' value computed in testee().
static const float table[3] = {
1.0,
2.0,
4.0,
};
// Function under test: the 'rem' vector computed here is the value whose
// lanes are extracted with geti(), which is where the reported spill occurs.
//
// Per lane of x:
//   iexp = (bits(x) >> 23) - 127         shifted-down bit pattern minus the bias
//   quot = trunc(float(iexp) / 3)
//   rem  = iexp - 3 * quot
//   out  = 0.5f * table[rem]
//
// Returns the four 'out' lanes as an __m128.
static __m128 testee(
    const __m128 x)
{
    const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23),
        _mm_set1_epi32(127));
    // x with its exponent field cleared and replaced by 0x7f.
    // Not used further in this reduced test case.
    const __m128 s = _mm_or_ps(
        _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
        _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));
    const __m128 exp = _mm_cvtepi32_ps(iexp);
    const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
    // 16-bit multiply by 0x10003: low half times 3, high half times 1.
    // NOTE(review): presumably a stand-in for a 32-bit multiply by 3 and
    // only equivalent for small |quot| -- confirm.
    const __m128i rem = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot,
        _mm_set1_epi32(0x10003)));
    // NOTE(review): the conversion above truncates toward zero, so 'rem'
    // looks negative for negative exponents (the sample inputs are all
    // below 1.0), which would index before table[0] -- confirm. Left
    // unchanged to keep the reproducer as posted.
    //
    // 'rem' gets spilled depending on version of lane extractor used
    const __m128 entry = _mm_setr_ps(
        table[geti(rem, 0)],
        table[geti(rem, 1)],
        table[geti(rem, 2)],
        table[geti(rem, 3)]);
    return _mm_set1_ps(.5f) * entry;
}
// Entry point of the reproducer: runs testee() on the global input x[0]
// and stores the result to the global r[0]. argc/argv are unused.
int main(int argc, char** argv)
{
r[0] = testee(x[0]);
return 0;
}
In the above function 'testee' (which is inlined into 'main' in the
disassembly below), the local variable 'rem' is either kept in registers or
spilled to the stack and read back as scalars, depending on which version
of the integer lane accessor 'geti' is used.
Output from clang 3.4 for target corei7-avx:
$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
4004f0: vmovdqa 0x2004c8(%rip),%xmm0 # 6009c0 <x>
4004f8: vpsrld $0x17,%xmm0,%xmm0
4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0 # 400680
<__dso_handle+0x8>
400505: vcvtdq2ps %xmm0,%xmm1
400509: vdivps 0x17f(%rip),%xmm1,%xmm1 # 400690
<__dso_handle+0x18>
400511: vcvttps2dq %xmm1,%xmm1
400515: vpmullw 0x183(%rip),%xmm1,%xmm1 # 4006a0
<__dso_handle+0x28>
40051d: vpsubd %xmm1,%xmm0,%xmm0
400521: vmovq %xmm0,%rax
400526: movslq %eax,%rcx
400529: sar $0x20,%rax
40052d: vpextrq $0x1,%xmm0,%rdx
400533: movslq %edx,%rsi
400536: sar $0x20,%rdx
40053a: vmovss 0x4006c0(,%rcx,4),%xmm0
400543: vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
40054e: vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
400559: vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
400564: vmulps 0x144(%rip),%xmm0,%xmm0 # 4006b0
<__dso_handle+0x38>
40056c: vmovaps %xmm0,0x20046c(%rip) # 6009e0 <r>
400574: xor %eax,%eax
400576: retq
$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
4004f0: vmovdqa 0x2004c8(%rip),%xmm0 # 6009c0 <x>
4004f8: vpsrld $0x17,%xmm0,%xmm0
4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0 # 400680
<__dso_handle+0x8>
400505: vcvtdq2ps %xmm0,%xmm1
400509: vdivps 0x17f(%rip),%xmm1,%xmm1 # 400690
<__dso_handle+0x18>
400511: vcvttps2dq %xmm1,%xmm1
400515: vpmullw 0x183(%rip),%xmm1,%xmm1 # 4006a0
<__dso_handle+0x28>
40051d: vpsubd %xmm1,%xmm0,%xmm0
400521: vmovdqa %xmm0,-0x18(%rsp)
400527: movslq -0x18(%rsp),%rax
40052c: movslq -0x14(%rsp),%rcx
400531: movslq -0x10(%rsp),%rdx
400536: movslq -0xc(%rsp),%rsi
40053b: vmovss 0x4006c0(,%rax,4),%xmm0
400544: vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
40054f: vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
40055a: vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
400565: vmulps 0x143(%rip),%xmm0,%xmm0 # 4006b0
<__dso_handle+0x38>
40056d: vmovaps %xmm0,0x20046b(%rip) # 6009e0 <r>
400575: xor %eax,%eax
400577: retq
Output from clang pre-release 3.5 trunk for target btver1:
$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
4005c0: movdqa 0x1a58(%rip),%xmm0 # 402020 <x>
4005c8: psrld $0x17,%xmm0
4005cd: paddd 0x12b(%rip),%xmm0 # 400700 <.LCPI0_0>
4005d5: cvtdq2ps %xmm0,%xmm1
4005d8: divps 0x131(%rip),%xmm1 # 400710 <.LCPI0_1>
4005df: cvttps2dq %xmm1,%xmm1
4005e3: pmullw 0x135(%rip),%xmm1 # 400720 <.LCPI0_2>
4005eb: psubd %xmm1,%xmm0
4005ef: movq %xmm0,%rax
4005f4: movslq %eax,%rcx
4005f7: sar $0x20,%rax
4005fb: punpckhqdq %xmm0,%xmm0
4005ff: movq %xmm0,%rdx
400604: movslq %edx,%rsi
400607: sar $0x20,%rdx
40060b: movss 0x400740(,%rax,4),%xmm0
400614: movss 0x400740(,%rdx,4),%xmm1
40061d: unpcklps %xmm1,%xmm0
400620: movss 0x400740(,%rcx,4),%xmm1
400629: movss 0x400740(,%rsi,4),%xmm2
400632: unpcklps %xmm2,%xmm1
400635: unpcklps %xmm0,%xmm1
400638: mulps 0xf1(%rip),%xmm1 # 400730 <.LCPI0_3>
40063f: movaps %xmm1,0x1a1a(%rip) # 402060 <r>
400646: xor %eax,%eax
400648: retq
$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
4005c0: movdqa 0x1a58(%rip),%xmm0 # 402020 <x>
4005c8: psrld $0x17,%xmm0
4005cd: paddd 0x12b(%rip),%xmm0 # 400700 <.LCPI0_0>
4005d5: cvtdq2ps %xmm0,%xmm1
4005d8: divps 0x131(%rip),%xmm1 # 400710 <.LCPI0_1>
4005df: cvttps2dq %xmm1,%xmm1
4005e3: pmullw 0x135(%rip),%xmm1 # 400720 <.LCPI0_2>
4005eb: psubd %xmm1,%xmm0
4005ef: movdqa %xmm0,-0x18(%rsp)
4005f5: movslq -0x18(%rsp),%rax
4005fa: movslq -0x14(%rsp),%rcx
4005ff: movslq -0x10(%rsp),%rdx
400604: movslq -0xc(%rsp),%rsi
400609: movss 0x400740(,%rsi,4),%xmm0
400612: movss 0x400740(,%rcx,4),%xmm1
40061b: unpcklps %xmm0,%xmm1
40061e: movss 0x400740(,%rdx,4),%xmm0
400627: movss 0x400740(,%rax,4),%xmm2
400630: unpcklps %xmm0,%xmm2
400633: unpcklps %xmm1,%xmm2
400636: mulps 0xf3(%rip),%xmm2 # 400730 <.LCPI0_3>
40063d: movaps %xmm2,0x1a1c(%rip) # 402060 <r>
400644: xor %eax,%eax
400646: retq
Is this behavior expected? I find it odd.
Best regards,
Martin
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141013/6ff9a867/attachment.html>
More information about the llvm-dev
mailing list