[LLVMbugs] [Bug 9623] New: x86: inefficient code generated for i8 vector types

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Mon Apr 4 05:22:40 PDT 2011


http://llvm.org/bugs/show_bug.cgi?id=9623

           Summary: x86: inefficient code generated for i8 vector types
           Product: new-bugs
           Version: trunk
          Platform: PC
        OS/Version: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: matt at pharr.org
                CC: llvmbugs at cs.uiuc.edu


Given this input:

define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y, <4 x i8> %__mask) nounwind
readnone alwaysinline {
entry:
  %binop = mul <4 x i8> %x, %y
  %binop6 = add <4 x i8> %binop, %x
  ret <4 x i8> %binop6
}

The following quite lengthy code is generated by llc.  It would be nice to get
the appropriate MMX instructions instead.  (This is probably not a high-priority
fix in the grand scheme of things, though.)

_foo:                                   ## @foo
## BB#0:                                ## %entry
    pushq    %rbp
    pushq    %r15
    pushq    %r14
    pushq    %r13
    pushq    %r12
    pushq    %rbx
    movdqa    %xmm0, %xmm2
    pextrb    $1, %xmm2, %eax
    pextrb    $1, %xmm1, %ecx
    mulb    %cl
    pextrb    $0, %xmm2, %ecx
    pextrb    $0, %xmm1, %edx
    movzbl    %al, %esi
    movb    %cl, %al
    mulb    %dl
    movzbl    %al, %eax
    movd    %eax, %xmm0
    pextrb    $2, %xmm2, %eax
    pextrb    $2, %xmm1, %ecx
    pinsrb    $1, %esi, %xmm0
    mulb    %cl
    movb    %al, %cl
    pextrb    $3, %xmm2, %eax
    pextrb    $3, %xmm1, %edx
    mulb    %dl
    movb    %al, %dl
    movzbl    %cl, %ecx
    pextrb    $4, %xmm2, %eax
    pextrb    $4, %xmm1, %esi
    pinsrb    $2, %ecx, %xmm0
    mulb    %sil
    movzbl    %dl, %ecx
    pextrb    $11, %xmm2, %edx
    pextrb    $12, %xmm2, %esi
    pextrb    $13, %xmm2, %edi
    pextrb    $14, %xmm2, %r8d
    movl    %r8d, -4(%rsp)          ## 4-byte Spill
    pextrb    $5, %xmm1, %r9d
    pextrb    $5, %xmm2, %r10d
    pextrb    $8, %xmm1, %r11d
    pinsrb    $3, %ecx, %xmm0
    movzbl    %al, %ecx
    pextrb    $15, %xmm2, %ebx
    pextrb    $8, %xmm2, %r14d
    pextrb    $12, %xmm1, %r15d
    movb    %r10b, %al
    pextrb    $13, %xmm1, %r10d
    pinsrb    $4, %ecx, %xmm0
    pextrb    $14, %xmm1, %ecx
    pextrb    $15, %xmm1, %r12d
    mulb    %r9b
    movb    %al, %r9b
    pextrb    $11, %xmm1, %r13d
    pextrb    $10, %xmm2, %ebp
    movb    %r14b, %al
    mulb    %r11b
    movb    %al, %r11b
    pextrb    $9, %xmm2, %eax
    pextrb    $9, %xmm1, %r14d
    mulb    %r14b
    movb    %al, %r14b
    pextrb    $10, %xmm1, %r8d
    movb    %bpl, %al
    mulb    %r8b
    movb    %al, %r8b
    movb    %dl, %al
    mulb    %r13b
    movb    %al, %dl
    movb    %sil, %al
    mulb    %r15b
    movb    %al, %sil
    movb    %dil, %al
    mulb    %r10b
    movb    %al, %dil
    movl    -4(%rsp), %eax          ## 4-byte Reload
    mulb    %cl
    movb    %al, %cl
    movb    %bl, %al
    mulb    %r12b
    movb    %al, %r10b
    movzbl    %r9b, %r9d
    pextrb    $7, %xmm2, %eax
    pextrb    $7, %xmm1, %ebx
    mulb    %bl
    pinsrb    $5, %r9d, %xmm0
    movzbl    %r10b, %r9d
    movzbl    %cl, %ecx
    movzbl    %dil, %edi
    movzbl    %sil, %esi
    movzbl    %dl, %edx
    movzbl    %r8b, %r8d
    movzbl    %r14b, %r10d
    movzbl    %r11b, %r11d
    movzbl    %al, %ebx
    pextrb    $6, %xmm2, %eax
    pextrb    $6, %xmm1, %r14d
    mulb    %r14b
    movzbl    %al, %eax
    pinsrb    $6, %eax, %xmm0
    pinsrb    $7, %ebx, %xmm0
    pinsrb    $8, %r11d, %xmm0
    pinsrb    $9, %r10d, %xmm0
    pinsrb    $10, %r8d, %xmm0
    pinsrb    $11, %edx, %xmm0
    pinsrb    $12, %esi, %xmm0
    pinsrb    $13, %edi, %xmm0
    pinsrb    $14, %ecx, %xmm0
    pinsrb    $15, %r9d, %xmm0
    paddb    %xmm2, %xmm0
    popq    %rbx
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
    popq    %rbp
    ret

If I explicitly extract the values from the vector, do the math, and repack,
like this:

define <4 x i8> @bar(<4 x i8> %x, <4 x i8> %y, <4 x i8> %__mask) nounwind
readnone alwaysinline {
entry:
  %x0 = extractelement <4 x i8> %x, i32 0
  %x1 = extractelement <4 x i8> %x, i32 1
  %x2 = extractelement <4 x i8> %x, i32 2
  %x3 = extractelement <4 x i8> %x, i32 3

  %y0 = extractelement <4 x i8> %y, i32 0
  %y1 = extractelement <4 x i8> %y, i32 1
  %y2 = extractelement <4 x i8> %y, i32 2
  %y3 = extractelement <4 x i8> %y, i32 3

  %m0 = mul i8 %x0, %y0
  %m1 = mul i8 %x1, %y1
  %m2 = mul i8 %x2, %y2
  %m3 = mul i8 %x3, %y3

  %a0 = add i8 %m0, %x0
  %a1 = add i8 %m1, %x1
  %a2 = add i8 %m2, %x2
  %a3 = add i8 %m3, %x3

  %r0 = insertelement <4 x i8> undef, i8 %a0, i32 0
  %r1 = insertelement <4 x i8> %r0, i8 %a1, i32 1
  %r2 = insertelement <4 x i8> %r1, i8 %a2, i32 2
  %r3 = insertelement <4 x i8> %r2, i8 %a3, i32 3

  ret <4 x i8> %r3
}

The code is better:

_bar:                                   ## @bar
## BB#0:                                ## %entry
    pextrb    $2, %xmm0, %ecx
    pextrb    $2, %xmm1, %edx
    movb    %cl, %al
    mulb    %dl
    movb    %al, %dl
    addb    %cl, %dl
    pextrb    $0, %xmm0, %ecx
    pextrb    $0, %xmm1, %esi
    movb    %cl, %al
    mulb    %sil
    pextrb    $3, %xmm0, %esi
    movb    %al, %dil
    addb    %cl, %dil
    movzbl    %dl, %ecx
    pextrb    $3, %xmm1, %edx
    movb    %sil, %al
    mulb    %dl
    addb    %sil, %al
    movzbl    %al, %edx
    shll    $8, %edx
    pextrb    $1, %xmm0, %esi
    orl    %ecx, %edx
    movzbl    %dil, %ecx
    pextrb    $1, %xmm1, %edi
    movb    %sil, %al
    mulb    %dil
    addb    %sil, %al
    movzbl    %al, %eax
    shll    $8, %eax
    orl    %ecx, %eax
    pinsrw    $0, %eax, %xmm0
    pinsrw    $1, %edx, %xmm0
    ret

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.



More information about the llvm-bugs mailing list