[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
martin krastev
blu.dark at gmail.com
Tue Oct 14 01:36:05 PDT 2014
Hi Quentin,
Thank you for the directions. Here is the bug ticket:
http://llvm.org/bugs/show_bug.cgi?id=21269
Best regards,
Martin
On Mon, Oct 13, 2014 at 8:03 PM, Quentin Colombet <qcolombet at apple.com>
wrote:
> Hi Martin,
>
> I haven’t checked what is going on here, but if you believe some spill can
> be avoided, this is worth filing a PR (www.llvm.org/bugs) to libraries ->
> Register Allocator.
> Please attach the IR to reproduce the problem (-emit-llvm from clang).
>
> Thanks,
> -Quentin
>
> On Oct 13, 2014, at 9:13 AM, martin krastev <blu.dark at gmail.com> wrote:
>
> > Hello,
> >
> > Depending on how I extract integer lanes from an x86_64 xmm register,
> the backend may spill that register in order to load scalars. The effect
> was observed on two targets: corei7-avx and btver1 (I haven't checked other
> targets).
> >
> > Here's a test case in which the spilling and non-spilling versions of the
> lane extractor are selected via conditional compilation:
> >
> > #if __SSE4_1__ != 0
> > #include <smmintrin.h>
> > #else
> > #include <emmintrin.h>
> > #endif
> > #include <stdint.h>
> > #include <assert.h>
> >
> > #if SPILLING_ENSUES == 1
> > static int32_t geti(const __m128i v, const size_t i)
> > {
> > switch (i) {
> > case 0:
> > return _mm_cvtsi128_si32(v);
> > case 1:
> > return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
> > case 2:
> > return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
> > case 3:
> > return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
> > }
> >
> > assert(0);
> > return -1;
> > }
> >
> > #else
> > static int32_t geti(const __m128i v, const size_t i)
> > {
> > switch (i) {
> > case 0:
> > return int32_t(v[0] >> 0);
> > case 1:
> > return int32_t(v[0] >> 32);
> > case 2:
> > return int32_t(v[1] >> 0);
> > case 3:
> > return int32_t(v[1] >> 32);
> > }
> >
> > assert(0);
> > return -1;
> > }
> > #endif
> >
> > __m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
> > __m128 r[1];
> >
> > static const float table[3] = {
> > 1.0,
> > 2.0,
> > 4.0,
> > };
> >
> > static __m128 testee(
> > const __m128 x)
> > {
> > const __m128i iexp =
> _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23), _mm_set1_epi32(127));
> > const __m128 s = _mm_or_ps(
> > _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff <<
> 23)), x),
> > _mm_castsi128_ps(_mm_set1_epi32(0x7f <<
> 23)));
> >
> > const __m128 exp = _mm_cvtepi32_ps(iexp);
> > const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp,
> _mm_set1_ps(3.f)));
> > const __m128i rem = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot,
> _mm_set1_epi32(0x10003)));
> >
> > const __m128 entry = _mm_setr_ps( // 'rem' gets spilled depending
> on version of lane extractor used
> > table[geti(rem, 0)],
> > table[geti(rem, 1)],
> > table[geti(rem, 2)],
> > table[geti(rem, 3)]);
> >
> > return _mm_set1_ps(.5f) * entry;
> > }
> >
> > int main(int argc, char** argv)
> > {
> > r[0] = testee(x[0]);
> > return 0;
> > }
> >
> >
> > In the above function 'testee' (duly inlined in the disassembly below),
> the local variable 'rem' gets spilled to the stack and read back as scalars,
> depending on which version of the integer lane accessor is used.
> >
> > Output from clang 3.4 for target corei7-avx:
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004004f0 <main>:
> > 4004f0: vmovdqa 0x2004c8(%rip),%xmm0 # 6009c0 <x>
> > 4004f8: vpsrld $0x17,%xmm0,%xmm0
> > 4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0 # 400680
> <__dso_handle+0x8>
> > 400505: vcvtdq2ps %xmm0,%xmm1
> > 400509: vdivps 0x17f(%rip),%xmm1,%xmm1 # 400690
> <__dso_handle+0x18>
> > 400511: vcvttps2dq %xmm1,%xmm1
> > 400515: vpmullw 0x183(%rip),%xmm1,%xmm1 # 4006a0
> <__dso_handle+0x28>
> > 40051d: vpsubd %xmm1,%xmm0,%xmm0
> > 400521: vmovq %xmm0,%rax
> > 400526: movslq %eax,%rcx
> > 400529: sar $0x20,%rax
> > 40052d: vpextrq $0x1,%xmm0,%rdx
> > 400533: movslq %edx,%rsi
> > 400536: sar $0x20,%rdx
> > 40053a: vmovss 0x4006c0(,%rcx,4),%xmm0
> > 400543: vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
> > 40054e: vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
> > 400559: vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
> > 400564: vmulps 0x144(%rip),%xmm0,%xmm0 # 4006b0
> <__dso_handle+0x38>
> > 40056c: vmovaps %xmm0,0x20046c(%rip) # 6009e0 <r>
> > 400574: xor %eax,%eax
> > 400576: retq
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004004f0 <main>:
> > 4004f0: vmovdqa 0x2004c8(%rip),%xmm0 # 6009c0 <x>
> > 4004f8: vpsrld $0x17,%xmm0,%xmm0
> > 4004fd: vpaddd 0x17b(%rip),%xmm0,%xmm0 # 400680
> <__dso_handle+0x8>
> > 400505: vcvtdq2ps %xmm0,%xmm1
> > 400509: vdivps 0x17f(%rip),%xmm1,%xmm1 # 400690
> <__dso_handle+0x18>
> > 400511: vcvttps2dq %xmm1,%xmm1
> > 400515: vpmullw 0x183(%rip),%xmm1,%xmm1 # 4006a0
> <__dso_handle+0x28>
> > 40051d: vpsubd %xmm1,%xmm0,%xmm0
> > 400521: vmovdqa %xmm0,-0x18(%rsp)
> > 400527: movslq -0x18(%rsp),%rax
> > 40052c: movslq -0x14(%rsp),%rcx
> > 400531: movslq -0x10(%rsp),%rdx
> > 400536: movslq -0xc(%rsp),%rsi
> > 40053b: vmovss 0x4006c0(,%rax,4),%xmm0
> > 400544: vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
> > 40054f: vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
> > 40055a: vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
> > 400565: vmulps 0x143(%rip),%xmm0,%xmm0 # 4006b0
> <__dso_handle+0x38>
> > 40056d: vmovaps %xmm0,0x20046b(%rip) # 6009e0 <r>
> > 400575: xor %eax,%eax
> > 400577: retq
> >
> >
> > Output from clang pre-release 3.5 trunk for target btver1:
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=0 /* no spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004005c0 <main>:
> > 4005c0: movdqa 0x1a58(%rip),%xmm0 # 402020 <x>
> > 4005c8: psrld $0x17,%xmm0
> > 4005cd: paddd 0x12b(%rip),%xmm0 # 400700 <.LCPI0_0>
> > 4005d5: cvtdq2ps %xmm0,%xmm1
> > 4005d8: divps 0x131(%rip),%xmm1 # 400710 <.LCPI0_1>
> > 4005df: cvttps2dq %xmm1,%xmm1
> > 4005e3: pmullw 0x135(%rip),%xmm1 # 400720 <.LCPI0_2>
> > 4005eb: psubd %xmm1,%xmm0
> > 4005ef: movq %xmm0,%rax
> > 4005f4: movslq %eax,%rcx
> > 4005f7: sar $0x20,%rax
> > 4005fb: punpckhqdq %xmm0,%xmm0
> > 4005ff: movq %xmm0,%rdx
> > 400604: movslq %edx,%rsi
> > 400607: sar $0x20,%rdx
> > 40060b: movss 0x400740(,%rax,4),%xmm0
> > 400614: movss 0x400740(,%rdx,4),%xmm1
> > 40061d: unpcklps %xmm1,%xmm0
> > 400620: movss 0x400740(,%rcx,4),%xmm1
> > 400629: movss 0x400740(,%rsi,4),%xmm2
> > 400632: unpcklps %xmm2,%xmm1
> > 400635: unpcklps %xmm0,%xmm1
> > 400638: mulps 0xf1(%rip),%xmm1 # 400730 <.LCPI0_3>
> > 40063f: movaps %xmm1,0x1a1a(%rip) # 402060 <r>
> > 400646: xor %eax,%eax
> > 400648: retq
> >
> > $ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
> -march=native -mtune=native -DSPILLING_ENSUES=1 /* spilling */
> > $ objdump -dC --no-show-raw-insn ./a.out
> > ...
> > 00000000004005c0 <main>:
> > 4005c0: movdqa 0x1a58(%rip),%xmm0 # 402020 <x>
> > 4005c8: psrld $0x17,%xmm0
> > 4005cd: paddd 0x12b(%rip),%xmm0 # 400700 <.LCPI0_0>
> > 4005d5: cvtdq2ps %xmm0,%xmm1
> > 4005d8: divps 0x131(%rip),%xmm1 # 400710 <.LCPI0_1>
> > 4005df: cvttps2dq %xmm1,%xmm1
> > 4005e3: pmullw 0x135(%rip),%xmm1 # 400720 <.LCPI0_2>
> > 4005eb: psubd %xmm1,%xmm0
> > 4005ef: movdqa %xmm0,-0x18(%rsp)
> > 4005f5: movslq -0x18(%rsp),%rax
> > 4005fa: movslq -0x14(%rsp),%rcx
> > 4005ff: movslq -0x10(%rsp),%rdx
> > 400604: movslq -0xc(%rsp),%rsi
> > 400609: movss 0x400740(,%rsi,4),%xmm0
> > 400612: movss 0x400740(,%rcx,4),%xmm1
> > 40061b: unpcklps %xmm0,%xmm1
> > 40061e: movss 0x400740(,%rdx,4),%xmm0
> > 400627: movss 0x400740(,%rax,4),%xmm2
> > 400630: unpcklps %xmm0,%xmm2
> > 400633: unpcklps %xmm1,%xmm2
> > 400636: mulps 0xf3(%rip),%xmm2 # 400730 <.LCPI0_3>
> > 40063d: movaps %xmm2,0x1a1c(%rip) # 402060 <r>
> > 400644: xor %eax,%eax
> > 400646: retq
> >
> >
> > Is that behavior expected? Because I find it odd.
> >
> > Best regards,
> > Martin
> > _______________________________________________
> > LLVM Developers mailing list
> > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
> > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141014/e7b43539/attachment.html>
More information about the llvm-dev mailing list