[LLVMbugs] [Bug 20225] New: Generates a total mess of SIMD for simple array initialization
bugzilla-daemon at llvm.org
Mon Jul 7 02:14:07 PDT 2014
http://llvm.org/bugs/show_bug.cgi?id=20225
Bug ID: 20225
Summary: Generates a total mess of SIMD for simple array initialization
Product: clang
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: LLVM Codegen
Assignee: unassignedclangbugs at nondot.org
Reporter: bisqwit at iki.fi
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
For this code:
unsigned char lengths[256];

void init(void)
{
    unsigned a;
    for(a=0; a<256; ++a) lengths[a] = 8 + (a >= 144);
}
At optimization levels -O2 and above, including -Os, Clang emits an abhorrent
mess of SIMD code that ends up three times larger than the array it is
initializing, on x86_64 and i386 systems when SSE2 is enabled.
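(A command line along the lines of "clang++ -O2 -S test1.cc" on x86_64, where
SSE2 is on by default, should reproduce this.)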
Generated code:
.file "test1.cc"
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LCPI0_0:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.LCPI0_1:
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.LCPI0_2:
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_3:
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_4:
.long 16 # 0x10
.long 17 # 0x11
.long 18 # 0x12
.long 19 # 0x13
.LCPI0_5:
.long 24 # 0x18
.long 25 # 0x19
.long 26 # 0x1a
.long 27 # 0x1b
.LCPI0_6:
.long 20 # 0x14
.long 21 # 0x15
.long 22 # 0x16
.long 23 # 0x17
.LCPI0_7:
.long 28 # 0x1c
.long 29 # 0x1d
.long 30 # 0x1e
.long 31 # 0x1f
.LCPI0_8:
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.LCPI0_9:
.long 2147483791 # 0x8000008f
.long 2147483791 # 0x8000008f
.long 2147483791 # 0x8000008f
.long 2147483791 # 0x8000008f
.LCPI0_10:
.zero 16,1
.LCPI0_11:
.zero 16,8
.text
.globl _Z4initv
.align 16, 0x90
.type _Z4initv,@function
_Z4initv: # @_Z4initv
.cfi_startproc
# BB#0: # %vector.ph
subq $264, %rsp # imm = 0x108
.Ltmp1:
.cfi_def_cfa_offset 272
xorl %eax, %eax
movdqa .LCPI0_7(%rip), %xmm15
movdqa .LCPI0_8(%rip), %xmm13
movdqa .LCPI0_9(%rip), %xmm14
movdqa .LCPI0_10(%rip), %xmm11
movdqa .LCPI0_11(%rip), %xmm12
.align 16, 0x90
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
movd %eax, %xmm4
pshufd $0, %xmm4, %xmm8 # xmm8 = xmm4[0,0,0,0]
movdqa %xmm8, %xmm4
paddd .LCPI0_0(%rip), %xmm4
movdqa %xmm8, %xmm5
paddd .LCPI0_1(%rip), %xmm5
movdqa %xmm8, %xmm6
paddd .LCPI0_2(%rip), %xmm6
movdqa %xmm8, %xmm7
paddd .LCPI0_3(%rip), %xmm7
pxor %xmm13, %xmm7
pcmpgtd %xmm14, %xmm7
movdqa %xmm7, 96(%rsp)
pxor %xmm13, %xmm6
pcmpgtd %xmm14, %xmm6
movdqa %xmm6, 192(%rsp)
pxor %xmm13, %xmm5
pcmpgtd %xmm14, %xmm5
movdqa %xmm5, 144(%rsp)
pxor %xmm13, %xmm4
pcmpgtd %xmm14, %xmm4
movdqa %xmm4, 240(%rsp)
movdqa %xmm7, 64(%rsp)
movdqa %xmm6, 160(%rsp)
movdqa %xmm5, 112(%rsp)
movdqa %xmm4, 208(%rsp)
movdqa %xmm7, 80(%rsp)
movdqa %xmm6, 176(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm4, 224(%rsp)
movd 108(%rsp), %xmm9
movd 204(%rsp), %xmm10
punpcklbw %xmm9, %xmm10 # xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
movd 156(%rsp), %xmm2
movd 252(%rsp), %xmm3
punpcklbw %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
punpcklbw %xmm10, %xmm3 # xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
movd 68(%rsp), %xmm2
movd 164(%rsp), %xmm0
punpcklbw %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
movd 116(%rsp), %xmm2
movd 212(%rsp), %xmm1
punpcklbw %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
movdqa %xmm8, %xmm2
paddd .LCPI0_4(%rip), %xmm2
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
movdqa %xmm8, %xmm0
paddd .LCPI0_5(%rip), %xmm0
punpcklbw %xmm3, %xmm1 # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
movdqa %xmm8, %xmm3
paddd .LCPI0_6(%rip), %xmm3
paddd %xmm15, %xmm8
punpcklbw %xmm7, %xmm6 # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
punpcklbw %xmm5, %xmm4 # xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
punpcklbw %xmm6, %xmm4 # xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
movd 88(%rsp), %xmm5
movd 184(%rsp), %xmm6
punpcklbw %xmm5, %xmm6 # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
movd 136(%rsp), %xmm5
movd 232(%rsp), %xmm7
punpcklbw %xmm5, %xmm7 # xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
punpcklbw %xmm6, %xmm7 # xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
punpcklbw %xmm7, %xmm4 # xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
punpcklbw %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
pxor %xmm13, %xmm8
pcmpgtd %xmm14, %xmm8
movdqa %xmm8, -96(%rsp)
pxor %xmm13, %xmm3
pcmpgtd %xmm14, %xmm3
movdqa %xmm3, (%rsp)
pxor %xmm13, %xmm0
pcmpgtd %xmm14, %xmm0
movdqa %xmm0, -48(%rsp)
pxor %xmm13, %xmm2
pcmpgtd %xmm14, %xmm2
movdqa %xmm2, 48(%rsp)
movdqa %xmm8, -128(%rsp)
movdqa %xmm3, -32(%rsp)
movdqa %xmm0, -80(%rsp)
movdqa %xmm2, 16(%rsp)
movdqa %xmm8, -112(%rsp)
movdqa %xmm3, -16(%rsp)
movdqa %xmm0, -64(%rsp)
movdqa %xmm2, 32(%rsp)
movd -84(%rsp), %xmm1
movd 12(%rsp), %xmm5
punpcklbw %xmm1, %xmm5 # xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
movd -36(%rsp), %xmm1
movd 60(%rsp), %xmm6
punpcklbw %xmm1, %xmm6 # xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
punpcklbw %xmm5, %xmm6 # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
movd -124(%rsp), %xmm1
movd -28(%rsp), %xmm5
punpcklbw %xmm1, %xmm5 # xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
movd -76(%rsp), %xmm1
movd 20(%rsp), %xmm7
punpcklbw %xmm1, %xmm7 # xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
punpcklbw %xmm5, %xmm7 # xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
punpcklbw %xmm6, %xmm7 # xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
punpcklbw %xmm8, %xmm3 # xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
punpcklbw %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
punpcklbw %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
movd -104(%rsp), %xmm0
movd -8(%rsp), %xmm1
punpcklbw %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
movd -56(%rsp), %xmm0
movd 40(%rsp), %xmm3
punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
punpcklbw %xmm1, %xmm3 # xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
punpcklbw %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
punpcklbw %xmm7, %xmm2 # xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
pand %xmm11, %xmm4
por %xmm12, %xmm4
movdqa %xmm4, lengths(%rax)
pand %xmm11, %xmm2
por %xmm12, %xmm2
movdqa %xmm2, lengths+16(%rax)
addq $32, %rax
cmpq $256, %rax # imm = 0x100
jne .LBB0_1
# BB#2: # %middle.block
addq $264, %rsp # imm = 0x108
retq
.Ltmp2:
.size _Z4initv, .Ltmp2-_Z4initv
.cfi_endproc
.type lengths,@object # @lengths
.bss
.globl lengths
.align 16
lengths:
.zero 256
.size lengths, 256
.ident "Debian clang version 3.5-1 (trunk) (based on LLVM 3.5)"
.section ".note.GNU-stack","", at progbits
Compared to what it does without SSE:
_Z4initv: # @_Z4initv
.cfi_startproc
# BB#0:
xorl %eax, %eax
.align 16, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
cmpl $143, %eax
seta %cl
orb $8, %cl
movb %cl, lengths(%rax)
incq %rax
cmpq $256, %rax # imm = 0x100
jne .LBB0_1
# BB#2:
retq
.Ltmp2:
.size _Z4initv, .Ltmp2-_Z4initv
.cfi_endproc
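Read back into C, each iteration of that scalar loop computes the following
(my rendering, with a hypothetical function name, not compiler output):

    extern unsigned char lengths[256];

    /* cmpl/seta materializes the comparison as 0 or 1; orb folds in the
       constant 8. */
    void init_scalar_equivalent(void)
    {
        unsigned long a;
        for (a = 0; a < 256; ++a) {
            unsigned char c = (a > 143); /* cmpl $143, %eax ; seta %cl */
            c |= 8;                      /* orb $8, %cl */
            lengths[a] = c;              /* movb %cl, lengths(%rax) */
        }
    }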
Or what it does when the loop is rewritten as two loops:

    for(a=0; a<144; ++a) lengths[a] = 8;
    for(a=144; a<256; ++a) lengths[a] = 9;
_Z4initv: # @_Z4initv
.cfi_startproc
# BB#0: # %.preheader
pushq %rax
.Ltmp1:
.cfi_def_cfa_offset 16
movl $lengths, %edi
movl $8, %esi
movl $144, %edx
callq memset
movabsq $651061555542690057, %rax # imm = 0x909090909090909
movq %rax, lengths+248(%rip)
movq %rax, lengths+240(%rip)
movq %rax, lengths+232(%rip)
movq %rax, lengths+224(%rip)
movq %rax, lengths+216(%rip)
movq %rax, lengths+208(%rip)
movq %rax, lengths+200(%rip)
movq %rax, lengths+192(%rip)
movq %rax, lengths+184(%rip)
movq %rax, lengths+176(%rip)
movq %rax, lengths+168(%rip)
movq %rax, lengths+160(%rip)
movq %rax, lengths+152(%rip)
movq %rax, lengths+144(%rip)
popq %rax
retq
.Ltmp2:
.size _Z4initv, .Ltmp2-_Z4initv
.cfi_endproc
This is what it should have done for the first version too, in my opinion.
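In C terms, the lowering I am asking for is roughly this sketch (the function
name is hypothetical):

    #include <string.h>

    extern unsigned char lengths[256];

    void init_expected(void)
    {
        memset(lengths, 8, 144);       /* a in [0,144):   value 8 */
        memset(lengths + 144, 9, 112); /* a in [144,256): value 9 */
    }

As the listing above shows, the second fill is small enough that clang expands
it inline into movq stores instead of a second memset call.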
For comparison, GCC does not use SIMD for the first version at all. For the
second version it uses "rep stosq" rather than calling memset, and it loads
the address of lengths(%rip) into a register with lea and addresses through
that register for smaller code, but is otherwise identical.
Rewriting 8 + (a >= 144) as (a >= 144 ? 9 : 8) does not change the outcome in
any way.