[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets

Tue Oct 14 01:36:05 PDT 2014

Hi Quentin,

Thank you for the directions. Here is the bug ticket:
http://llvm.org/bugs/show_bug.cgi?id=21269

Best regards,
Martin

On Mon, Oct 13, 2014 at 8:03 PM, Quentin Colombet <qcolombet at apple.com>
wrote:

> Hi Martin,
>
> I haven’t checked what is going on here, but if you believe some spill can
> be avoided, this is worth filing a PR (www.llvm.org/bugs) against the
> libraries -> Register Allocator component.
> Please attach the IR to reproduce the problem (-emit-llvm from clang).
>
> Thanks,
> -Quentin
>
> On Oct 13, 2014, at 9:13 AM, martin krastev <blu.dark at gmail.com> wrote:
>
> > Hello,
> >
> > Depending on how I extract integer lanes from an x86_64 xmm register,
> the backend may spill that register in order to load scalars. The effect
> was observed on two targets: corei7-avx and btver1 (I haven't checked other
> targets).
> >
> > Here's a test case in which the spilling and non-spilling versions of the
> lane extractor are selected by conditional compilation (SPILLING_ENSUES):
> >
> > #if __SSE4_1__ != 0
> >       #include <smmintrin.h>
> > #else
> >       #include <emmintrin.h>
> > #endif
> > #include <stdint.h>
> > #include <assert.h>
> >
> > #if SPILLING_ENSUES == 1
> > static int32_t geti(const __m128i v, const size_t i)
> > {
> >       switch (i) {
> >       case 0:
> >               return _mm_cvtsi128_si32(v);
> >       case 1:
> >               return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
> >       case 2:
> >               return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
> >       case 3:
> >               return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
> >       }
> >
> >       assert(0);
> >       return -1;
> > }
> >
> > #else
> > static int32_t geti(const __m128i v, const size_t i)
> > {
> >       switch (i) {
> >       case 0:
> >               return int32_t(v[0] >> 0);
> >       case 1:
> >               return int32_t(v[0] >> 32);
> >       case 2:
> >               return int32_t(v[1] >> 0);
> >       case 3:
> >               return int32_t(v[1] >> 32);
> >       }
> >
> >       assert(0);
> >       return -1;
> > }
> > #endif
> >
> > __m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
> > __m128 r[1];
> >
> > static const float table[3] = {
> >       1.0,
> >       2.0,
> >       4.0,
> > };
> >
> > static __m128 testee(
> >       const __m128 x)
> > {
> >       const __m128i iexp =
> _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23), _mm_set1_epi32(127));
> >       const __m128 s = _mm_or_ps(
> >               _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff <<
> 23)), x),
> >                             _mm_castsi128_ps(_mm_set1_epi32(0x7f <<
> 23)));
> >
> >       const __m128 exp = _mm_cvtepi32_ps(iexp);
> >       const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp,
> _mm_set1_ps(3.f)));
> >       const __m128i rem  = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot,
> _mm_set1_epi32(0x10003)));
> >
> >       const __m128 entry = _mm_setr_ps( // 'rem' gets spilled depending
> on version of lane extractor used
> >               table[geti(rem, 0)],
> >               table[geti(rem, 1)],
> >               table[geti(rem, 2)],
> >               table[geti(rem, 3)]);
> >
> >       return _mm_set1_ps(.5f) * entry;
> > }
> >
> > int main(int argc, char** argv)
> > {
> >       r[0] = testee(x[0]);
> >       return 0;
> > }
> >
> >
> > In the above function 'testee' (duly inlined in the disassembly below),
> local var 'rem' gets spilled and read back as scalars, depending on which
> version of the integer lane accessor was used.
> >
> > Output from clang 3.4 for target corei7-avx:
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=0   /* no spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004004f0 <main>:
> >   4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
> >   4004f8:   vpsrld $0x17,%xmm0,%xmm0
> >   4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680
> <__dso_handle+0x8>
> >   400505:   vcvtdq2ps %xmm0,%xmm1
> >   400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690
> <__dso_handle+0x18>
> >   400511:   vcvttps2dq %xmm1,%xmm1
> >   400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0
> <__dso_handle+0x28>
> >   40051d:   vpsubd %xmm1,%xmm0,%xmm0
> >   400521:   vmovq  %xmm0,%rax
> >   400526:   movslq %eax,%rcx
> >   400529:   sar    $0x20,%rax
> >   40052d:   vpextrq $0x1,%xmm0,%rdx
> >   400533:   movslq %edx,%rsi
> >   400536:   sar    $0x20,%rdx
> >   40053a:   vmovss 0x4006c0(,%rcx,4),%xmm0
> >   400543:   vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
> >   40054e:   vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
> >   400559:   vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
> >   400564:   vmulps 0x144(%rip),%xmm0,%xmm0        # 4006b0
> <__dso_handle+0x38>
> >   40056c:   vmovaps %xmm0,0x20046c(%rip)        # 6009e0 <r>
> >   400574:   xor    %eax,%eax
> >   400576:   retq
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=1    /* spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004004f0 <main>:
> >   4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
> >   4004f8:   vpsrld $0x17,%xmm0,%xmm0
> >   4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680
> <__dso_handle+0x8>
> >   400505:   vcvtdq2ps %xmm0,%xmm1
> >   400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690
> <__dso_handle+0x18>
> >   400511:   vcvttps2dq %xmm1,%xmm1
> >   400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0
> <__dso_handle+0x28>
> >   40051d:   vpsubd %xmm1,%xmm0,%xmm0
> >   400521:   vmovdqa %xmm0,-0x18(%rsp)
> >   400527:   movslq -0x18(%rsp),%rax
> >   40052c:   movslq -0x14(%rsp),%rcx
> >   400531:   movslq -0x10(%rsp),%rdx
> >   400536:   movslq -0xc(%rsp),%rsi
> >   40053b:   vmovss 0x4006c0(,%rax,4),%xmm0
> >   400544:   vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
> >   40054f:   vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
> >   40055a:   vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
> >   400565:   vmulps 0x143(%rip),%xmm0,%xmm0        # 4006b0
> <__dso_handle+0x38>
> >   40056d:   vmovaps %xmm0,0x20046b(%rip)        # 6009e0 <r>
> >   400575:   xor    %eax,%eax
> >   400577:   retq
> >
> >
> > Output from clang pre-release 3.5 trunk for target btver1:
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=0   /* no spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004005c0 <main>:
> >   4005c0:     movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
> >   4005c8:     psrld  $0x17,%xmm0
> >   4005cd:     paddd  0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
> >   4005d5:     cvtdq2ps %xmm0,%xmm1
> >   4005d8:     divps  0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
> >   4005df:     cvttps2dq %xmm1,%xmm1
> >   4005e3:     pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
> >   4005eb:     psubd  %xmm1,%xmm0
> >   4005ef:     movq   %xmm0,%rax
> >   4005f4:     movslq %eax,%rcx
> >   4005f7:     sar    $0x20,%rax
> >   4005fb:     punpckhqdq %xmm0,%xmm0
> >   4005ff:     movq   %xmm0,%rdx
> >   400604:     movslq %edx,%rsi
> >   400607:     sar    $0x20,%rdx
> >   40060b:     movss  0x400740(,%rax,4),%xmm0
> >   400614:     movss  0x400740(,%rdx,4),%xmm1
> >   40061d:     unpcklps %xmm1,%xmm0
> >   400620:     movss  0x400740(,%rcx,4),%xmm1
> >   400629:     movss  0x400740(,%rsi,4),%xmm2
> >   400632:     unpcklps %xmm2,%xmm1
> >   400635:     unpcklps %xmm0,%xmm1
> >   400638:     mulps  0xf1(%rip),%xmm1        # 400730 <.LCPI0_3>
> >   40063f:     movaps %xmm1,0x1a1a(%rip)        # 402060 <r>
> >   400646:     xor    %eax,%eax
> >   400648:     retq
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=1    /* spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004005c0 <main>:
> >   4005c0:     movdqa 0x1a58(%rip),%xmm0        # 402020 <x>
> >   4005c8:     psrld  $0x17,%xmm0
> >   4005cd:     paddd  0x12b(%rip),%xmm0        # 400700 <.LCPI0_0>
> >   4005d5:     cvtdq2ps %xmm0,%xmm1
> >   4005d8:     divps  0x131(%rip),%xmm1        # 400710 <.LCPI0_1>
> >   4005df:     cvttps2dq %xmm1,%xmm1
> >   4005e3:     pmullw 0x135(%rip),%xmm1        # 400720 <.LCPI0_2>
> >   4005eb:     psubd  %xmm1,%xmm0
> >   4005ef:     movdqa %xmm0,-0x18(%rsp)
> >   4005f5:     movslq -0x18(%rsp),%rax
> >   4005fa:     movslq -0x14(%rsp),%rcx
> >   4005ff:     movslq -0x10(%rsp),%rdx
> >   400604:     movslq -0xc(%rsp),%rsi
> >   400609:     movss  0x400740(,%rsi,4),%xmm0
> >   400612:     movss  0x400740(,%rcx,4),%xmm1
> >   40061b:     unpcklps %xmm0,%xmm1
> >   40061e:     movss  0x400740(,%rdx,4),%xmm0
> >   400627:     movss  0x400740(,%rax,4),%xmm2
> >   400630:     unpcklps %xmm0,%xmm2
> >   400633:     unpcklps %xmm1,%xmm2
> >   400636:     mulps  0xf3(%rip),%xmm2        # 400730 <.LCPI0_3>
> >   40063d:     movaps %xmm2,0x1a1c(%rip)        # 402060 <r>
> >   400644:     xor    %eax,%eax
> >   400646:     retq
> >
> >
> > Is that behavior expected? Because I find it odd.
> >
> > Best regards,
> > Martin
> > _______________________________________________
> > LLVM Developers mailing list
> > LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141014/e7b43539/attachment.html>