[LLVMbugs] [Bug 13239] New: optimizer and code generator do not generate vector shuffle instructions but a lot of inserts and extracts

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Fri Jun 29 08:52:49 PDT 2012


http://llvm.org/bugs/show_bug.cgi?id=13239

             Bug #: 13239
           Summary: optimizer and code generator do not generate vector
                    shuffle instructions but a lot of inserts and extracts
           Product: libraries
           Version: 3.1
          Platform: PC
        OS/Version: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: llvm at henning-thielemann.de
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified


See the LL file below. It contains a lot of insertelements and extractelements
in order to arrange elements for the use of llvm.x86.ssse3.pabs.d.128. Usually
such insertelements and extractelements are translated to efficient vector
shuffling instructions.
But here neither does the optimizer generate shufflevector LLVM instructions, nor
does the X86 code generator create SSE or AVX shuffle instructions.
My machine supports AVX.

$ cat run-dis.ll
; ModuleID = 'run.bc'

; @_fun1 loads a <6 x i32> through its pointer argument, passes the elements
; through @llvm.x86.ssse3.pabs.d.128 in two <4 x i32> batches, and stores the
; reassembled <6 x i32> back through the same pointer.
; All regrouping between the 6-wide and 4-wide vectors is written as scalar
; extractelement/insertelement chains; there is no shufflevector.
define void @_fun1(<6 x i32>*) {
_L1:
  %1 = load <6 x i32>* %0
  ; Batch 1: source lanes 0..3 are copied one by one into a <4 x i32>.
  %2 = extractelement <6 x i32> %1, i32 0
  %3 = extractelement <6 x i32> %1, i32 1
  %4 = extractelement <6 x i32> %1, i32 2
  %5 = extractelement <6 x i32> %1, i32 3
  %6 = insertelement <4 x i32> undef, i32 %2, i32 0
  %7 = insertelement <4 x i32> %6, i32 %3, i32 1
  %8 = insertelement <4 x i32> %7, i32 %4, i32 2
  %9 = insertelement <4 x i32> %8, i32 %5, i32 3
  %10 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %9)
  ; The four results go into lanes 0..3 of a fresh <6 x i32>.
  %11 = extractelement <4 x i32> %10, i32 0
  %12 = insertelement <6 x i32> undef, i32 %11, i32 0
  %13 = extractelement <4 x i32> %10, i32 1
  %14 = insertelement <6 x i32> %12, i32 %13, i32 1
  %15 = extractelement <4 x i32> %10, i32 2
  %16 = insertelement <6 x i32> %14, i32 %15, i32 2
  %17 = extractelement <4 x i32> %10, i32 3
  %18 = insertelement <6 x i32> %16, i32 %17, i32 3
  ; Batch 2: only source lanes 4 and 5 remain, so each is extracted twice and
  ; the <4 x i32> operand becomes [lane4, lane5, lane4, lane5].
  %19 = extractelement <6 x i32> %1, i32 4
  %20 = extractelement <6 x i32> %1, i32 5
  %21 = extractelement <6 x i32> %1, i32 4
  %22 = extractelement <6 x i32> %1, i32 5
  %23 = insertelement <4 x i32> undef, i32 %19, i32 0
  %24 = insertelement <4 x i32> %23, i32 %20, i32 1
  %25 = insertelement <4 x i32> %24, i32 %21, i32 2
  %26 = insertelement <4 x i32> %25, i32 %22, i32 3
  %27 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %26)
  ; Results 0 and 1 fill lanes 4 and 5 of the output vector.
  %28 = extractelement <4 x i32> %27, i32 0
  %29 = insertelement <6 x i32> %18, i32 %28, i32 4
  %30 = extractelement <4 x i32> %27, i32 1
  %31 = insertelement <6 x i32> %29, i32 %30, i32 5
  ; NOTE(review): the next two inserts use indices 6 and 7, but the vector has
  ; only 6 elements (valid indices 0..5). The LLVM LangRef appears to leave the
  ; result of an out-of-range insertelement undefined -- confirm whether the
  ; generator that produced this file should emit them at all.
  %32 = extractelement <4 x i32> %27, i32 2
  %33 = insertelement <6 x i32> %31, i32 %32, i32 6
  %34 = extractelement <4 x i32> %27, i32 3
  %35 = insertelement <6 x i32> %33, i32 %34, i32 7
  store <6 x i32> %35, <6 x i32>* %0
  ret void
}

; SSSE3 intrinsic declaration; going by its name it maps to the 128-bit PABSD
; instruction (packed 32-bit absolute value) -- see the Intel SDM.
declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone

$ llvm-as < run-dis.ll | llc
        .file   "<stdin>"
        .text
        .globl  _fun1
        .align  16, 0x90
        # ELF symbol type is spelled '@function' in GAS x86 syntax.
        .type   _fun1,@function
#-----------------------------------------------------------------------
# void _fun1(<6 x i32> *p)      -- llc output for the IR function @_fun1
# Syntax: AT&T / GAS, x86-64 with AVX.
# In:     %rdi = p; accessed with vmovdqa, so it must be 32-byte aligned,
#         and a full 32 bytes are read although only 6 dwords are used.
# Out:    dwords 0..5 at p are replaced by their vpabsd results;
#         bytes 24..31 at p are not written.
# Clobb:  %rax, %rcx, %rdx, %ymm0, %ymm1, %xmm2
# Note:   every lane is moved through a general-purpose register with
#         vpextrd/vpinsrd; no vector shuffle instruction is used.
#-----------------------------------------------------------------------
_fun1:                                  # @_fun1
        .cfi_startproc
# BB#0:                                 # %_L1
        pushq   %rbp
.Ltmp2:
        .cfi_def_cfa_offset 16
.Ltmp3:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp4:
        .cfi_def_cfa_register %rbp
        vmovdqa (%rdi), %ymm0           # ymm0 = 8 dwords at p (lanes 0..5 used)

        # Rebuild lanes 0..2 of the low half in xmm1, one dword at a time.
        vpextrd $1, %xmm0, %eax
        vmovd   %xmm0, %ecx
        vmovd   %ecx, %xmm1
        vpinsrd $1, %eax, %xmm1, %xmm1
        vpextrd $2, %xmm0, %eax
        vpinsrd $2, %eax, %xmm1, %xmm1
        vpextrd $3, %xmm0, %edx         # lane 3 parked in edx until xmm0 is reused

        # xmm0 = upper 128 bits; start the second operand from lanes 4 and 5.
        vextractf128    $1, %ymm0, %xmm0
        vpextrd $1, %xmm0, %eax         # eax = source lane 5
        vmovd   %xmm0, %ecx             # ecx = source lane 4
        vmovd   %ecx, %xmm0
        vpinsrd $1, %eax, %xmm0, %xmm0

        # Finish the first operand and take absolute values.
        vpinsrd $3, %edx, %xmm1, %xmm1
        vpabsd  %xmm1, %xmm1

        # Lanes 1..3 of the result are extracted and re-inserted at the same
        # positions, so xmm1 ends up holding the same four values as before.
        vpextrd $1, %xmm1, %edx
        vpinsrd $1, %edx, %xmm1, %xmm2
        vpextrd $2, %xmm1, %edx
        vpinsrd $2, %edx, %xmm2, %xmm2
        vpextrd $3, %xmm1, %edx
        vpinsrd $3, %edx, %xmm2, %xmm1

        # Second operand becomes [lane4, lane5, lane4, lane5].
        vpinsrd $2, %ecx, %xmm0, %xmm0
        vpinsrd $3, %eax, %xmm0, %xmm0
        vmovdqa %xmm1, (%rdi)           # store result lanes 0..3
        vpabsd  %xmm0, %xmm0
        vpextrd $1, %xmm0, %eax
        vpinsrd $1, %eax, %xmm0, %xmm0  # lane 1 written back onto itself

        # xmm0 is packed into the upper half of ymm0 and immediately
        # extracted again; only its low 8 bytes are stored.
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        vextractf128    $1, %ymm0, %xmm0
        vmovq   %xmm0, 16(%rdi)         # store result lanes 4..5
        popq    %rbp
        vzeroupper                      # clear upper ymm state before returning
        ret
.Ltmp5:
        .size   _fun1, .Ltmp5-_fun1
        .cfi_endproc


        .section        ".note.GNU-stack","",@progbits

$ llvm-as < run-dis.ll | opt -O3 | llvm-dis
; ModuleID = '<stdin>'

; @_fun1 as printed by llvm-dis after `opt -O3`.
; The function now carries nocapture/nounwind and its calls are marked `tail`.
; The second batch extracts source lanes 4 and 5 only once each, and the final
; vector is built from inserts at indices 0..5 only.
; The extractelement/insertelement chains themselves are still present; no
; shufflevector instruction appears anywhere in this output.
define void @_fun1(<6 x i32>* nocapture) nounwind {
_L1:
  %1 = load <6 x i32>* %0
  ; Batch 1: source lanes 0..3 copied element-wise into a <4 x i32>.
  %2 = extractelement <6 x i32> %1, i32 0
  %3 = extractelement <6 x i32> %1, i32 1
  %4 = extractelement <6 x i32> %1, i32 2
  %5 = extractelement <6 x i32> %1, i32 3
  %6 = insertelement <4 x i32> undef, i32 %2, i32 0
  %7 = insertelement <4 x i32> %6, i32 %3, i32 1
  %8 = insertelement <4 x i32> %7, i32 %4, i32 2
  %9 = insertelement <4 x i32> %8, i32 %5, i32 3
  %10 = tail call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %9)
  ; Results copied element-wise into lanes 0..3 of the output vector.
  %11 = extractelement <4 x i32> %10, i32 0
  %12 = insertelement <6 x i32> undef, i32 %11, i32 0
  %13 = extractelement <4 x i32> %10, i32 1
  %14 = insertelement <6 x i32> %12, i32 %13, i32 1
  %15 = extractelement <4 x i32> %10, i32 2
  %16 = insertelement <6 x i32> %14, i32 %15, i32 2
  %17 = extractelement <4 x i32> %10, i32 3
  %18 = insertelement <6 x i32> %16, i32 %17, i32 3
  ; Batch 2: %19/%20 are each used twice, giving [lane4, lane5, lane4, lane5].
  %19 = extractelement <6 x i32> %1, i32 4
  %20 = extractelement <6 x i32> %1, i32 5
  %21 = insertelement <4 x i32> undef, i32 %19, i32 0
  %22 = insertelement <4 x i32> %21, i32 %20, i32 1
  %23 = insertelement <4 x i32> %22, i32 %19, i32 2
  %24 = insertelement <4 x i32> %23, i32 %20, i32 3
  %25 = tail call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %24)
  ; Only results 0 and 1 are used; they fill lanes 4 and 5 before the store.
  %26 = extractelement <4 x i32> %25, i32 0
  %27 = insertelement <6 x i32> %18, i32 %26, i32 4
  %28 = extractelement <4 x i32> %25, i32 1
  %29 = insertelement <6 x i32> %27, i32 %28, i32 5
  store <6 x i32> %29, <6 x i32>* %0
  ret void
}

; SSSE3 intrinsic declaration; going by its name it maps to the 128-bit PABSD
; instruction (packed 32-bit absolute value) -- see the Intel SDM.
declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.



More information about the llvm-bugs mailing list