[LLVMbugs] [Bug 13239] New: optimizer and code generator do not generate vector shuffle instructions but a lot of inserts and extracts
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Fri Jun 29 08:52:49 PDT 2012
http://llvm.org/bugs/show_bug.cgi?id=13239
Bug #: 13239
Summary: optimizer and code generator do not generate vector
shuffle instructions but a lot of inserts and extracts
Product: libraries
Version: 3.1
Platform: PC
OS/Version: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
AssignedTo: unassignedbugs at nondot.org
ReportedBy: llvm at henning-thielemann.de
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
See the LL file below. It contains a lot of insertelements and extractelements
in order to arrange elements for the use of llvm.x86.ssse3.pabs.d.128. Usually
such insertelements and extractelements are translated to efficient vector
shuffling instructions.
But here neither does the optimizer generate shufflevector LLVM instructions, nor
does the X86 code generator create SSE or AVX shuffle instructions.
My machine supports AVX.
$ cat run-dis.ll
; ModuleID = 'run.bc'
; Reproducer input: loads the <6 x i32> at %0, runs llvm.x86.ssse3.pabs.d.128
; over it in two <4 x i32> pieces, rebuilds a <6 x i32> from the results and
; stores it back to %0. All element movement is written as scalar
; extractelement/insertelement pairs.
define void @_fun1(<6 x i32>*) {
_L1:
%1 = load <6 x i32>* %0
; First piece: copy source elements 0..3 into lanes 0..3 of a <4 x i32>.
%2 = extractelement <6 x i32> %1, i32 0
%3 = extractelement <6 x i32> %1, i32 1
%4 = extractelement <6 x i32> %1, i32 2
%5 = extractelement <6 x i32> %1, i32 3
%6 = insertelement <4 x i32> undef, i32 %2, i32 0
%7 = insertelement <4 x i32> %6, i32 %3, i32 1
%8 = insertelement <4 x i32> %7, i32 %4, i32 2
%9 = insertelement <4 x i32> %8, i32 %5, i32 3
%10 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %9)
; Copy the four result lanes into elements 0..3 of the <6 x i32> result.
%11 = extractelement <4 x i32> %10, i32 0
%12 = insertelement <6 x i32> undef, i32 %11, i32 0
%13 = extractelement <4 x i32> %10, i32 1
%14 = insertelement <6 x i32> %12, i32 %13, i32 1
%15 = extractelement <4 x i32> %10, i32 2
%16 = insertelement <6 x i32> %14, i32 %15, i32 2
%17 = extractelement <4 x i32> %10, i32 3
%18 = insertelement <6 x i32> %16, i32 %17, i32 3
; Second piece: only source elements 4 and 5 remain, so they are extracted
; twice (%19/%21 and %20/%22) to fill all four lanes as [e4, e5, e4, e5].
%19 = extractelement <6 x i32> %1, i32 4
%20 = extractelement <6 x i32> %1, i32 5
%21 = extractelement <6 x i32> %1, i32 4
%22 = extractelement <6 x i32> %1, i32 5
%23 = insertelement <4 x i32> undef, i32 %19, i32 0
%24 = insertelement <4 x i32> %23, i32 %20, i32 1
%25 = insertelement <4 x i32> %24, i32 %21, i32 2
%26 = insertelement <4 x i32> %25, i32 %22, i32 3
%27 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %26)
; Result lanes 0 and 1 go to elements 4 and 5 of the <6 x i32> result.
%28 = extractelement <4 x i32> %27, i32 0
%29 = insertelement <6 x i32> %18, i32 %28, i32 4
%30 = extractelement <4 x i32> %27, i32 1
%31 = insertelement <6 x i32> %29, i32 %30, i32 5
; NOTE(review): the next two inserts use indices 6 and 7, which are outside
; the valid range 0..5 of a <6 x i32>. The `opt -O3` output later in this
; report no longer contains them and stores the value built up to index 5.
; Confirm against the LLVM Language Reference whether these inserts are
; well defined before relying on this input as a test case.
%32 = extractelement <4 x i32> %27, i32 2
%33 = insertelement <6 x i32> %31, i32 %32, i32 6
%34 = extractelement <4 x i32> %27, i32 3
%35 = insertelement <6 x i32> %33, i32 %34, i32 7
store <6 x i32> %35, <6 x i32>* %0
ret void
}
declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
$ llvm-as < run-dis.ll | llc
.file "<stdin>"
.text
.globl _fun1
.align 16, 0x90
.type _fun1,@function
# llc output for @_fun1, AT&T operand order (source first, destination last).
# The vector is read from and written back through the pointer in %rdi.
# NOTE(review): presumably the SysV AMD64 calling convention, since the
# report lists Linux as the OS - confirm.
# No shuffle instruction appears below: every lane is moved through a
# general-purpose register with vpextrd/vmovd + vpinsrd.
_fun1: # @_fun1
.cfi_startproc
# BB#0: # %_L1
# Frame-pointer prologue with the matching CFI annotations.
pushq %rbp
.Ltmp2:
.cfi_def_cfa_offset 16
.Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp4:
.cfi_def_cfa_register %rbp
# Load 32 bytes from (%rdi) into %ymm0 with an aligned move.
vmovdqa (%rdi), %ymm0
# Rebuild lanes 0..2 of the low 128 bits in %xmm1, one dword at a time via
# %ecx/%eax. Lane 3 is parked in %edx and inserted further down.
vpextrd $1, %xmm0, %eax
vmovd %xmm0, %ecx
vmovd %ecx, %xmm1
vpinsrd $1, %eax, %xmm1, %xmm1
vpextrd $2, %xmm0, %eax
vpinsrd $2, %eax, %xmm1, %xmm1
vpextrd $3, %xmm0, %edx
# Replace %xmm0 with the upper 128 bits of the load and start the second
# vector from its lanes 0 and 1 (kept in %ecx and %eax for reuse below).
vextractf128 $1, %ymm0, %xmm0
vpextrd $1, %xmm0, %eax
vmovd %xmm0, %ecx
vmovd %ecx, %xmm0
vpinsrd $1, %eax, %xmm0, %xmm0
# Complete the first vector with lane 3 and take the absolute values.
vpinsrd $3, %edx, %xmm1, %xmm1
vpabsd %xmm1, %xmm1
# Lanes 1..3 of the result are each extracted and inserted back at the same
# position (via %xmm2), so %xmm1 ends with the same contents it had right
# after vpabsd.
vpextrd $1, %xmm1, %edx
vpinsrd $1, %edx, %xmm1, %xmm2
vpextrd $2, %xmm1, %edx
vpinsrd $2, %edx, %xmm2, %xmm2
vpextrd $3, %xmm1, %edx
vpinsrd $3, %edx, %xmm2, %xmm1
# Fill lanes 2 and 3 of the second vector with the same two values already
# placed in lanes 0 and 1.
vpinsrd $2, %ecx, %xmm0, %xmm0
vpinsrd $3, %eax, %xmm0, %xmm0
# Store the first 16-byte result to (%rdi).
vmovdqa %xmm1, (%rdi)
# Absolute values of the second vector; lane 1 is again extracted and
# re-inserted at the same position.
vpabsd %xmm0, %xmm0
vpextrd $1, %xmm0, %eax
vpinsrd $1, %eax, %xmm0, %xmm0
# %xmm0 is inserted as the upper half of a ymm value and that same half is
# extracted again straight away; its low 8 bytes are then stored at
# offset 16 from %rdi.
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vextractf128 $1, %ymm0, %xmm0
vmovq %xmm0, 16(%rdi)
# Epilogue: restore %rbp, clear the upper ymm halves, return.
popq %rbp
vzeroupper
ret
# Symbol size is the distance from _fun1 to .Ltmp5.
.Ltmp5:
.size _fun1, .Ltmp5-_fun1
.cfi_endproc
.section ".note.GNU-stack","",@progbits
$ llvm-as < run-dis.ll | opt -O3 | llvm-dis
; ModuleID = '<stdin>'
; Output of `opt -O3` for @_fun1. Compared with the input shown earlier in
; this report, the parameter gained `nocapture`, the function gained
; `nounwind`, the intrinsic calls became tail calls, the duplicate extracts
; of elements 4 and 5 were merged, and the inserts at indices 6 and 7 are
; gone. The element movement is still expressed as extractelement and
; insertelement; no shufflevector instruction was formed.
define void @_fun1(<6 x i32>* nocapture) nounwind {
_L1:
%1 = load <6 x i32>* %0
; First piece: source elements 0..3 into lanes 0..3 of a <4 x i32>.
%2 = extractelement <6 x i32> %1, i32 0
%3 = extractelement <6 x i32> %1, i32 1
%4 = extractelement <6 x i32> %1, i32 2
%5 = extractelement <6 x i32> %1, i32 3
%6 = insertelement <4 x i32> undef, i32 %2, i32 0
%7 = insertelement <4 x i32> %6, i32 %3, i32 1
%8 = insertelement <4 x i32> %7, i32 %4, i32 2
%9 = insertelement <4 x i32> %8, i32 %5, i32 3
%10 = tail call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %9)
; Copy the four result lanes into elements 0..3 of the <6 x i32> result.
%11 = extractelement <4 x i32> %10, i32 0
%12 = insertelement <6 x i32> undef, i32 %11, i32 0
%13 = extractelement <4 x i32> %10, i32 1
%14 = insertelement <6 x i32> %12, i32 %13, i32 1
%15 = extractelement <4 x i32> %10, i32 2
%16 = insertelement <6 x i32> %14, i32 %15, i32 2
%17 = extractelement <4 x i32> %10, i32 3
%18 = insertelement <6 x i32> %16, i32 %17, i32 3
; Second piece: elements 4 and 5 are extracted once and each used twice,
; giving the lanes [e4, e5, e4, e5].
%19 = extractelement <6 x i32> %1, i32 4
%20 = extractelement <6 x i32> %1, i32 5
%21 = insertelement <4 x i32> undef, i32 %19, i32 0
%22 = insertelement <4 x i32> %21, i32 %20, i32 1
%23 = insertelement <4 x i32> %22, i32 %19, i32 2
%24 = insertelement <4 x i32> %23, i32 %20, i32 3
%25 = tail call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %24)
; Only lanes 0 and 1 of the second result are used; they become elements
; 4 and 5 of the stored vector.
%26 = extractelement <4 x i32> %25, i32 0
%27 = insertelement <6 x i32> %18, i32 %26, i32 4
%28 = extractelement <4 x i32> %25, i32 1
%29 = insertelement <6 x i32> %27, i32 %28, i32 5
store <6 x i32> %29, <6 x i32>* %0
ret void
}
declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
--
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.
More information about the llvm-bugs
mailing list