[LLVMdev] Is it a bug or am I missing something ?

Tue Feb 19 01:52:28 PST 2013

Hi all,

Given the following code:

; ModuleID = 'shufxbug.ll'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i386-pc-linux-gnu"

; sample_test: loads the <4 x float> at source[19], takes its lane 0 (call it s),
; and stores an <8 x float> to dest[19] whose lanes 1 and 2 are s and whose
; other six lanes are 0.0.
;   %source - base pointer to <4 x float> elements; element 19 is loaded.
;   %dest   - base pointer to <8 x float> elements; element 19 is written.
define void @sample_test(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline {
L.entry:
  ; &source[19]: 19 elements * 16 bytes = byte offset 304.
  %0 = getelementptr <4 x float>* %source, i32 19
  %1 = load <4 x float>* %0, align 16
  ; s = lane 0 of the loaded vector; lanes 1-3 are never used.
  %2 = extractelement <4 x float> %1, i32 0
  ; Start from the constant <0, undef, undef, 0, 0, 0, 0, 0> and overwrite the
  ; two undef lanes with s (lane 2 first, then lane 1).
  %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %2, i32 2
  %4 = insertelement <8 x float> %3, float %2, i32 1
  ; &dest[19]: 19 elements * 32 bytes = byte offset 608. The store only
  ; promises 4-byte alignment, so it cannot be assumed vector-aligned.
  %5 = getelementptr <8 x float>* %dest, i32 19
  store <8 x float> %4, <8 x float>* %5, align 4
  ret void
}

I expect LLVM to generate code that stores a vector with the following value at dest[19]:

<float 0.000000e+00, float 'elem_0_of_source' , float 'elem_0_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>

When I run llc from trunk as follows on a Core i7 machine, I get the following assembly code:

llc shufxbug.ll -march=x86 -relocation-model=pic -o shufxbug.s

    .file   "shufxbug.ll"
    .text
    .globl  sample_test
    .align  16, 0x90
    .type   sample_test,@function
# sample_test, SSE code path (32-bit x86, AT&T syntax, arguments on the stack).
#   In:  4(%esp) = first stack argument, 8(%esp) = second stack argument
#        (source and dest in the IR signature).
#   Out: 32 bytes written at dest+608 as two unaligned 16-byte stores.
#   Clobbers: %eax, %xmm0, %xmm1.
sample_test:                            # @sample_test
# BB#0:                                 # %L.entry
    movl    4(%esp), %eax 
    # Scalar load from byte offset 304 (= 19 * 16, lane 0 of source[19]).
    # movss with a memory source zeroes lanes 1-3 of the destination,
    # so xmm0 = [s, 0, 0, 0] after this instruction.
    movss   304(%eax), %xmm0
    xorps   %xmm1, %xmm1
    movl    8(%esp), %eax
    # Upper 16 bytes of the result (offset 608 + 16 = 624) are all zero.
    movups  %xmm1, 624(%eax)
    # Immediate 65 = binary 01 00 00 01, i.e. result lanes take source lanes
    # [1,0,0,1]. NOTE(review): because the movss above left lanes 1-3 zero,
    # this produces [0, s, s, 0]; lane 1 of the original memory vector is
    # never loaded on this path.
    pshufd  $65, %xmm0, %xmm0       # xmm0 = xmm0[1,0,0,1]
    # Lower 16 bytes of the result.
    movdqu  %xmm0, 608(%eax)
    ret
.Ltmp0:
    .size   sample_test, .Ltmp0-sample_test

    .section    ".note.GNU-stack","",@progbits

It seems to me that this sequence of instructions builds the vector:

<float 'elem_1_of_source', float 'elem_0_of_source' , float 'elem_0_of_source', float 'elem_1_of_source', float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>

On a Sandy Bridge system, I get similar behavior with slightly different code (using AVX):

    # AVX code path (excerpt; the function label and leading directives are
    # not shown). Standard frame-pointer prologue: after it, 8(%ebp) is the
    # first stack argument, 12(%ebp) the second, 16(%ebp) the third.
    # NOTE(review): the pointers are read from 12(%ebp) and 16(%ebp), which
    # suggests this build's function has an extra leading argument compared
    # with the two-argument IR signature - confirm against the actual source.
    pushl   %ebp
.Ltmp5:
    .cfi_def_cfa_offset 8
.Ltmp6:
    .cfi_offset %ebp, -8
    movl    %esp, %ebp
.Ltmp7:
    .cfi_def_cfa_register %ebp
    movl    12(%ebp), %eax
    .loc    1 9 0 prologue_end      # shufxbug.cl:9:0
.Ltmp8:
    # The shuffle takes its source directly from memory at byte offset 304,
    # so all four stored lanes are read. With selector [1,0,0,1], result
    # lanes 0 and 3 receive the stored lane 1, not zero; there is no
    # separate scalar load that would clear lanes 1-3 first.
    vpermilps   $65, 304(%eax), %xmm0 # xmm0 = mem[1,0,0,1]
    vxorps  %xmm1, %xmm1, %xmm1
    # Zeroed xmm1 becomes the upper 128 bits of ymm0.
    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    movl    16(%ebp), %eax
    .loc    1 10 0                  # shufxbug.cl:10:0
    # Single unaligned 32-byte store at byte offset 608 (= 19 * 32).
    vmovups %ymm0, 608(%eax)
    .loc    1 11 0                  # shufxbug.cl:11:0
    popl    %ebp
    # Clear the upper halves of the ymm registers before returning.
    vzeroupper
    ret

It seems to me that the generated code is incorrect in both cases. Can someone confirm this, or point out what I did wrong if it is not a bug?
If this turns out to be an actual bug, I'll file it in the bug tracker.

Thanks
Seb