[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets

Mon Oct 13 09:13:24 PDT 2014

Hello,

Depending on how I extract integer lanes from an x86_64 xmm register, the
backend may spill that register in order to load scalars. The effect was
observed on two targets: corei7-avx and btver1 (I haven't checked other
targets).

Here's a test case in which the spilling and non-spilling variants of the
lane extractor are selected by conditional compilation (SPILLING_ENSUES):

#if __SSE4_1__ != 0
#include <smmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>

#if SPILLING_ENSUES == 1
// Lane extractor, intrinsics variant (selected when SPILLING_ENSUES == 1).
// Returns 32-bit lane `i` (0..3) of `v`.
// Lane 0 is read directly with _mm_cvtsi128_si32; for lanes 1..3 the wanted
// lane is first moved into lane 0 by _mm_shuffle_epi32 and then read the
// same way. Any other `i` hits assert(0); -1 is returned if that assert is
// compiled out.
static int32_t geti(const __m128i v, const size_t i)
{
switch (i) {
case 0:
return _mm_cvtsi128_si32(v);
case 1:
// 0xe5: low two selector bits = 1, so source lane 1 lands in lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
case 2:
// 0xe6: low two selector bits = 2, so source lane 2 lands in lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
case 3:
// 0xe7: low two selector bits = 3, so source lane 3 lands in lane 0
return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
}

// Only reached for i > 3.
assert(0);
return -1;
}

#else
// Lane extractor, vector-subscript variant (selected when SPILLING_ENSUES != 1).
// Returns 32-bit lane `i` (0..3) of `v` by subscripting the vector and
// shifting: v[0] supplies lanes 0 and 1, v[1] supplies lanes 2 and 3; the
// shift by 0 or 32 picks the half and the conversion to int32_t truncates.
// Any other `i` hits assert(0); -1 is returned if that assert is compiled out.
static int32_t geti(const __m128i v, const size_t i)
{
switch (i) {
case 0:
return int32_t(v[0] >> 0);
case 1:
return int32_t(v[0] >> 32);
case 2:
return int32_t(v[1] >> 0);
case 3:
return int32_t(v[1] >> 32);
}

// Only reached for i > 3.
assert(0);
return -1;
}
#endif

// Input vector passed to testee() by main().
__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
// Receives the result of testee(); main() stores into r[0].
__m128 r[1];

// Lookup table indexed in testee() by the per-lane value geti(rem, k).
static const float table[3] = {
1.0,
2.0,
4.0,
};

// Reproducer kernel. For each 32-bit lane of `x` it:
//   1. shifts the raw float bits right by 23 and subtracts 127 (iexp),
//   2. computes quot = trunc(iexp / 3) and rem = iexp - 3 * quot,
//   3. looks up table[rem] for each of the four lanes through geti(),
//   4. returns 0.5f times the vector of looked-up values.
// Which geti() variant is compiled in decides whether 'rem' gets spilled.
static __m128 testee(
    const __m128 x)
{
    const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23),
                                       _mm_set1_epi32(127));
    // x with bits 23..30 cleared and then set to 0x7f; the value is not used
    // anywhere else in this function.
    const __m128 s = _mm_or_ps(
        _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
        _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));

    const __m128 exp = _mm_cvtepi32_ps(iexp);
    const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
    // 0x10003 per lane: the 16-bit multiply scales the low half of each lane
    // by 3 and the high half by 1, which equals a 32-bit multiply by 3 for
    // small |quot|.
    const __m128i rem  = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot,
                                                             _mm_set1_epi32(0x10003)));

    // NOTE(review): quot is truncated toward zero, so rem is negative when
    // iexp is negative; for the values in x[] (.123f, .999f) rem appears to
    // come out as -1, which would index table[] out of bounds -- confirm.
    // Left unchanged so the reproducer keeps matching the listings below.

    // 'rem' gets spilled depending on version of lane extractor used
    const __m128 entry = _mm_setr_ps(
        table[geti(rem, 0)],
        table[geti(rem, 1)],
        table[geti(rem, 2)],
        table[geti(rem, 3)]);

    return _mm_set1_ps(.5f) * entry;
}

// Driver: runs testee() on x[0] and stores the result in the global r[0].
// argc and argv are not used.
int main(int argc, char** argv)
{
r[0] = testee(x[0]);
return 0;
}

In the above function 'testee' (which is inlined into 'main' in the
disassembly below), the local variable 'rem' gets spilled and read back as
scalars, depending on which version of the integer lane accessor is used.

Output from clang 3.4 for target corei7-avx:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0   /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8:   vpsrld $0x17,%xmm0,%xmm0
  4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505:   vcvtdq2ps %xmm0,%xmm1
  400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511:   vcvttps2dq %xmm1,%xmm1
  400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d:   vpsubd %xmm1,%xmm0,%xmm0
  400521:   vmovq  %xmm0,%rax
  400526:   movslq %eax,%rcx
  400529:   sar    $0x20,%rax
  40052d:   vpextrq $0x1,%xmm0,%rdx
  400533:   movslq %edx,%rsi
  400536:   sar    $0x20,%rdx
  40053a:   vmovss 0x4006c0(,%rcx,4),%xmm0
  400543:   vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
  40054e:   vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400559:   vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
  400564:   vmulps 0x144(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056c:   vmovaps %xmm0,0x20046c(%rip)        # 6009e0 <r>
  400574:   xor    %eax,%eax
  400576:   retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1    /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8:   vpsrld $0x17,%xmm0,%xmm0
  4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505:   vcvtdq2ps %xmm0,%xmm1
  400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511:   vcvttps2dq %xmm1,%xmm1
  400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d:   vpsubd %xmm1,%xmm0,%xmm0
  400521:   vmovdqa %xmm0,-0x18(%rsp)
  400527:   movslq -0x18(%rsp),%rax
  40052c:   movslq -0x14(%rsp),%rcx
  400531:   movslq -0x10(%rsp),%rdx
  400536:   movslq -0xc(%rsp),%rsi
  40053b:   vmovss 0x4006c0(,%rax,4),%xmm0
  400544:   vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
  40054f:   vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
  40055a:   vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400565:   vmulps 0x143(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056d:   vmovaps %xmm0,0x20046b(%rip)        # 6009e0 <r>
  400575:   xor    %eax,%eax
  400577:   retq

Output from clang pre-release 3.5 trunk for target btver1:

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=0   /* no spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld  $0x17,%xmm0
  4005cd: paddd  0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps  0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd  %xmm1,%xmm0
  4005ef: movq   %xmm0,%rax
  4005f4: movslq %eax,%rcx
  4005f7: sar    $0x20,%rax
  4005fb: punpckhqdq %xmm0,%xmm0
  4005ff: movq   %xmm0,%rdx
  400604: movslq %edx,%rsi
  400607: sar    $0x20,%rdx
  40060b: movss  0x400740(,%rax,4),%xmm0
  400614: movss  0x400740(,%rdx,4),%xmm1
  40061d: unpcklps %xmm1,%xmm0
  400620: movss  0x400740(,%rcx,4),%xmm1
  400629: movss  0x400740(,%rsi,4),%xmm2
  400632: unpcklps %xmm2,%xmm1
  400635: unpcklps %xmm0,%xmm1
  400638: mulps  0xf1(%rip),%xmm1        # 400730 <.LCPI0_3>
  40063f: movaps %xmm1,0x1a1a(%rip)        # 402060 <r>
  400646: xor    %eax,%eax
  400648: retq

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math -march=native -mtune=native -DSPILLING_ENSUES=1    /* spilling */
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004005c0 <main>:
  4005c0: movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
  4005c8: psrld  $0x17,%xmm0
  4005cd: paddd  0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
  4005d5: cvtdq2ps %xmm0,%xmm1
  4005d8: divps  0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
  4005df: cvttps2dq %xmm1,%xmm1
  4005e3: pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
  4005eb: psubd  %xmm1,%xmm0
  4005ef: movdqa %xmm0,-0x18(%rsp)
  4005f5: movslq -0x18(%rsp),%rax
  4005fa: movslq -0x14(%rsp),%rcx
  4005ff: movslq -0x10(%rsp),%rdx
  400604: movslq -0xc(%rsp),%rsi
  400609: movss  0x400740(,%rsi,4),%xmm0
  400612: movss  0x400740(,%rcx,4),%xmm1
  40061b: unpcklps %xmm0,%xmm1
  40061e: movss  0x400740(,%rdx,4),%xmm0
  400627: movss  0x400740(,%rax,4),%xmm2
  400630: unpcklps %xmm0,%xmm2
  400633: unpcklps %xmm1,%xmm2
  400636: mulps  0xf3(%rip),%xmm2        # 400730 <.LCPI0_3>
  40063d: movaps %xmm2,0x1a1c(%rip)        # 402060 <r>
  400644: xor    %eax,%eax
  400646: retq

Is this behavior expected? I find it odd.

Best regards,
Martin
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141013/6ff9a867/attachment.html>