[LLVMbugs] [Bug 21269] New: Unexpected spilling of vector register during lane extraction on some x86_64 targets

Tue Oct 14 01:21:23 PDT 2014

http://llvm.org/bugs/show_bug.cgi?id=21269

            Bug ID: 21269
           Summary: Unexpected spilling of vector register during lane
                    extraction on some x86_64 targets
           Product: libraries
           Version: 3.4
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Register Allocator
          Assignee: unassignedbugs at nondot.org
          Reporter: blu.dark at gmail.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Depending on how one extracts integer lanes from an x86_64 xmm register, the
backend may spill that register in order to load scalars. The effect was
observed on two targets: corei7-avx and btver1. Other targets were not checked.

Here is a test case in which the spilling and non-spilling variants of the lane
extractor are selected by conditional compilation (-DSPILLING_ENSUES=1 or 0):

#if __SSE4_1__ != 0
    #include <smmintrin.h>
#else
    #include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>

#if SPILLING_ENSUES != 0
// Intrinsic-based lane extractor (compiled when SPILLING_ENSUES != 0).
// Returns 32-bit lane `i` (0..3) of `v`. Any other index trips assert(0);
// if assertions are compiled out, -1 is returned instead.
// This is the variant for which the report observes 'rem' being spilled.
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        // Lane 0 is already in the low 32 bits.
        return _mm_cvtsi128_si32(v);
    case 1:
        // The low two bits of the shuffle immediate pick the source lane that
        // lands in lane 0: 0xe5 -> lane 1.
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
    case 2:
        // 0xe6 -> lane 2 moved into lane 0.
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
    case 3:
        // 0xe7 -> lane 3 moved into lane 0.
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
    }

    // Unreachable for valid lane indices.
    assert(0);
    return -1;
}

#else
// Subscript-based lane extractor (compiled when SPILLING_ENSUES == 0).
// Returns 32-bit lane `i` (0..3) of `v`. Any other index trips assert(0);
// if assertions are compiled out, -1 is returned instead.
// v[0] and v[1] are the two 64-bit elements of the vector (the IR quoted in
// this report shows them as <2 x i64> extracts); each holds two 32-bit lanes.
// The shift selects the half and the cast to int32_t truncates to it.
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0:
        return int32_t(v[0] >> 0);   // low half of element 0
    case 1:
        return int32_t(v[0] >> 32);  // high half of element 0
    case 2:
        return int32_t(v[1] >> 0);   // low half of element 1
    case 3:
        return int32_t(v[1] >> 32);  // high half of element 1
    }

    // Unreachable for valid lane indices.
    assert(0);
    return -1;
}
#endif

// Input vector read by main() and passed to testee().
__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
// Output slot: main() stores the result of testee() here.
__m128 r[1];

// Lookup table indexed in testee() by the per-lane values extracted from 'rem'.
static const float table[3] = {
    1.0,
    2.0,
    4.0,
};

// Function under test: derives a per-lane integer 'rem' from the exponent
// bits of `x`, uses each lane of 'rem' as an index into table[], and returns
// the gathered table entries scaled by 0.5.
//
// The four geti(rem, k) calls are the point of the report: depending on which
// geti() variant is compiled in, 'rem' is either kept in a register or
// spilled to the stack and re-read as scalars.
static __m128 testee(
    const __m128 x)
{
    // Per lane: (bits of x >> 23) - 127.
    const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23),
                                       _mm_set1_epi32(127));
    // Exponent field of x cleared and replaced with 0x7f. Not used further in
    // this function; kept as in the reporter's test case.
    const __m128 s = _mm_or_ps(
        _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
                      _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));

    const __m128 exp = _mm_cvtepi32_ps(iexp);
    // quot = iexp / 3, truncated toward zero by the float -> int conversion.
    const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
    // rem = iexp - quot * 3. The multiply is done on 16-bit halves
    // (low half * 3, high half * 1), which matches a 32-bit multiply by 3 as
    // long as the low-half product needs no carry into the high half.
    const __m128i rem  = _mm_sub_epi32(iexp, _mm_mullo_epi16(quot,
                                       _mm_set1_epi32(0x10003)));

    // NOTE(review): because the division truncates, 'rem' looks like it is
    // negative for negative exponents that are not multiples of 3 (this seems
    // to include the values in x[]), which would index table[] out of range.
    // Confirm before reusing this code outside of codegen inspection.

    // 'rem' gets spilled depending on version of lane extractor used
    const __m128 entry = _mm_setr_ps(
        table[geti(rem, 0)],
        table[geti(rem, 1)],
        table[geti(rem, 2)],
        table[geti(rem, 3)]);

    return _mm_set1_ps(.5f) * entry;
}

// Entry point: runs testee() on the global input x[0] and stores the result
// in the global r[0]. Always returns 0; argc and argv are unused.
int main(int argc, char** argv)
{
    r[0] = testee(x[0]);
    return 0;
}

In the above function 'testee' (inlined into 'main' in the disassembly below),
the local variable 'rem' is spilled to the stack and read back as scalars,
depending on which version of the integer lane extractor is used.

Output from clang 3.4 for target corei7-avx:

#### no spilling ####

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=0
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8:   vpsrld $0x17,%xmm0,%xmm0
  4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505:   vcvtdq2ps %xmm0,%xmm1
  400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511:   vcvttps2dq %xmm1,%xmm1
  400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d:   vpsubd %xmm1,%xmm0,%xmm0
  400521:   vmovq  %xmm0,%rax
  400526:   movslq %eax,%rcx
  400529:   sar    $0x20,%rax
  40052d:   vpextrq $0x1,%xmm0,%rdx
  400533:   movslq %edx,%rsi
  400536:   sar    $0x20,%rdx
  40053a:   vmovss 0x4006c0(,%rcx,4),%xmm0
  400543:   vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
  40054e:   vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400559:   vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
  400564:   vmulps 0x144(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056c:   vmovaps %xmm0,0x20046c(%rip)        # 6009e0 <r>
  400574:   xor    %eax,%eax
  400576:   retq

Corresponding '-S -emit-llvm' from the above:

; ModuleID = 'test.cpp'
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"

@x = global [1 x <4 x float>] [<4 x float> <float 0x3FBF7CEDA0000000, float
0x3FEFF7CEE0000000, float 0x3FBF7CEDA0000000, float 0x3FEFF7CEE0000000>], align
16
@r = global [1 x <4 x float>] zeroinitializer, align 16
@_ZL5table = internal unnamed_addr constant [3 x float] [float 1.000000e+00,
float 2.000000e+00, float 4.000000e+00], align 4

; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
  %1 = load <4 x float>* getelementptr inbounds ([1 x <4 x float>]* @x, i64 0,
i64 0), align 16, !tbaa !1
  %2 = bitcast <4 x float> %1 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %2, i32 23) #2
  %4 = add <4 x i32> %3, <i32 -127, i32 -127, i32 -127, i32 -127>
  %5 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %4) #2
  %6 = fdiv fast <4 x float> %5, <float 3.000000e+00, float 3.000000e+00, float
3.000000e+00, float 3.000000e+00>
  %7 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %6) #2
  %8 = bitcast <4 x i32> %7 to <8 x i16>
  %9 = mul <8 x i16> %8, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16
1>
  %10 = bitcast <8 x i16> %9 to <4 x i32>
  %11 = sub <4 x i32> %4, %10
  %12 = bitcast <4 x i32> %11 to <2 x i64>
  %13 = extractelement <2 x i64> %12, i32 0
  %sext.i = shl i64 %13, 32
  %14 = ashr exact i64 %sext.i, 32
  %15 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %14
  %16 = load float* %15, align 4, !tbaa !4
  %17 = ashr i64 %13, 32
  %18 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %17
  %19 = load float* %18, align 4, !tbaa !4
  %20 = extractelement <2 x i64> %12, i32 1
  %sext5.i = shl i64 %20, 32
  %21 = ashr exact i64 %sext5.i, 32
  %22 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %21
  %23 = load float* %22, align 4, !tbaa !4
  %24 = ashr i64 %20, 32
  %25 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %24
  %26 = load float* %25, align 4, !tbaa !4
  %27 = insertelement <4 x float> undef, float %16, i32 0
  %28 = insertelement <4 x float> %27, float %19, i32 1
  %29 = insertelement <4 x float> %28, float %23, i32 2
  %30 = insertelement <4 x float> %29, float %26, i32 3
  %31 = fmul fast <4 x float> %30, <float 5.000000e-01, float 5.000000e-01,
float 5.000000e-01, float 5.000000e-01>
  store <4 x float> %31, <4 x float>* getelementptr inbounds ([1 x <4 x
float>]* @r, i64 0, i64 0), align 16, !tbaa !1
  ret i32 0
}

; Function Attrs: nounwind readnone
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="true"
"no-nans-fp-math"="true" "stack-protector-buffer-size"="8"
"unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (tags/RELEASE_34/final)"}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}
!4 = metadata !{metadata !5, metadata !5, i64 0}
!5 = metadata !{metadata !"float", metadata !2, i64 0}

#### spilling ####

$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=1
$ objdump -dC --no-show-raw-insn ./a.out
...
00000000004004f0 <main>:
  4004f0:   vmovdqa 0x2004c8(%rip),%xmm0        # 6009c0 <x>
  4004f8:   vpsrld $0x17,%xmm0,%xmm0
  4004fd:   vpaddd 0x17b(%rip),%xmm0,%xmm0        # 400680 <__dso_handle+0x8>
  400505:   vcvtdq2ps %xmm0,%xmm1
  400509:   vdivps 0x17f(%rip),%xmm1,%xmm1        # 400690 <__dso_handle+0x18>
  400511:   vcvttps2dq %xmm1,%xmm1
  400515:   vpmullw 0x183(%rip),%xmm1,%xmm1        # 4006a0 <__dso_handle+0x28>
  40051d:   vpsubd %xmm1,%xmm0,%xmm0
  400521:   vmovdqa %xmm0,-0x18(%rsp)
  400527:   movslq -0x18(%rsp),%rax
  40052c:   movslq -0x14(%rsp),%rcx
  400531:   movslq -0x10(%rsp),%rdx
  400536:   movslq -0xc(%rsp),%rsi
  40053b:   vmovss 0x4006c0(,%rax,4),%xmm0
  400544:   vinsertps $0x10,0x4006c0(,%rcx,4),%xmm0,%xmm0
  40054f:   vinsertps $0x20,0x4006c0(,%rdx,4),%xmm0,%xmm0
  40055a:   vinsertps $0x30,0x4006c0(,%rsi,4),%xmm0,%xmm0
  400565:   vmulps 0x143(%rip),%xmm0,%xmm0        # 4006b0 <__dso_handle+0x38>
  40056d:   vmovaps %xmm0,0x20046b(%rip)        # 6009e0 <r>
  400575:   xor    %eax,%eax
  400577:   retq

Corresponding '-S -emit-llvm' from the above:

; ModuleID = 'test.cpp'
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"

@x = global [1 x <4 x float>] [<4 x float> <float 0x3FBF7CEDA0000000, float
0x3FEFF7CEE0000000, float 0x3FBF7CEDA0000000, float 0x3FEFF7CEE0000000>], align
16
@r = global [1 x <4 x float>] zeroinitializer, align 16
@_ZL5table = internal unnamed_addr constant [3 x float] [float 1.000000e+00,
float 2.000000e+00, float 4.000000e+00], align 4

; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
  %1 = load <4 x float>* getelementptr inbounds ([1 x <4 x float>]* @x, i64 0,
i64 0), align 16, !tbaa !1
  %2 = bitcast <4 x float> %1 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %2, i32 23) #2
  %4 = add <4 x i32> %3, <i32 -127, i32 -127, i32 -127, i32 -127>
  %5 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %4) #2
  %6 = fdiv fast <4 x float> %5, <float 3.000000e+00, float 3.000000e+00, float
3.000000e+00, float 3.000000e+00>
  %7 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %6) #2
  %8 = bitcast <4 x i32> %7 to <8 x i16>
  %9 = mul <8 x i16> %8, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16
1>
  %10 = bitcast <8 x i16> %9 to <4 x i32>
  %11 = sub <4 x i32> %4, %10
  %12 = extractelement <4 x i32> %11, i32 0
  %13 = sext i32 %12 to i64
  %14 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %13
  %15 = load float* %14, align 4, !tbaa !4
  %16 = extractelement <4 x i32> %11, i32 1
  %17 = sext i32 %16 to i64
  %18 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %17
  %19 = load float* %18, align 4, !tbaa !4
  %20 = extractelement <4 x i32> %11, i32 2
  %21 = sext i32 %20 to i64
  %22 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %21
  %23 = load float* %22, align 4, !tbaa !4
  %24 = extractelement <4 x i32> %11, i32 3
  %25 = sext i32 %24 to i64
  %26 = getelementptr inbounds [3 x float]* @_ZL5table, i64 0, i64 %25
  %27 = load float* %26, align 4, !tbaa !4
  %28 = insertelement <4 x float> undef, float %15, i32 0
  %29 = insertelement <4 x float> %28, float %19, i32 1
  %30 = insertelement <4 x float> %29, float %23, i32 2
  %31 = insertelement <4 x float> %30, float %27, i32 3
  %32 = fmul fast <4 x float> %31, <float 5.000000e-01, float 5.000000e-01,
float 5.000000e-01, float 5.000000e-01>
  store <4 x float> %32, <4 x float>* getelementptr inbounds ([1 x <4 x
float>]* @r, i64 0, i64 0), align 16, !tbaa !1
  ret i32 0
}

; Function Attrs: nounwind readnone
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="true"
"no-nans-fp-math"="true" "stack-protector-buffer-size"="8"
"unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (tags/RELEASE_34/final)"}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}
!4 = metadata !{metadata !5, metadata !5, i64 0}
!5 = metadata !{metadata !"float", metadata !2, i64 0}

Regards,
Martin

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20141014/96b0faae/attachment.html>