[LLVMbugs] [Bug 13248] New: efficient implementation of sitofp for vectors
bugzilla-daemon at llvm.org
Sun Jul 1 03:10:02 PDT 2012
http://llvm.org/bugs/show_bug.cgi?id=13248
Bug #: 13248
Summary: efficient implementation of sitofp for vectors
Product: libraries
Version: 3.1
Platform: PC
OS/Version: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
AssignedTo: unassignedbugs at nondot.org
ReportedBy: llvm at henning-thielemann.de
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
I want to implement a 'signum' function for float and double vectors. This
works well for 128-bit SSE vectors, but the code generated for 256-bit AVX
vectors is sometimes awful. I am using an AVX machine (no AVX2).
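For comparison, this is the scalar computation the IR below is meant to
vectorize; a minimal C sketch (the name signum32 and the values printed by
main are only illustrative, not part of the reproducer):

#include <stdio.h>

/* Branch-free signum: (x > 0) is 1 or 0 and (x < 0) is 1 or 0, so their
   difference is +1, 0, or -1.  The IR below expresses the same thing with
   vector fcmp (whose i1 results become 0.0/-1.0 under sitofp) and an fsub. */
static float signum32(float x)
{
    return (float)(x > 0.0f) - (float)(x < 0.0f);
}

int main(void)
{
    printf("%g %g %g\n", signum32(-3.5f), signum32(0.0f), signum32(2.0f));
    /* expected output: -1 0 1 */
    return 0;
}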
$ cat signum.ll
; generic implementation for 128-bit vectors: works well
define void @signum32a(<4 x float>*) {
_L1:
%1 = load <4 x float>* %0
%2 = fcmp olt <4 x float> %1, zeroinitializer
%3 = sitofp <4 x i1> %2 to <4 x float>
%4 = fcmp ogt <4 x float> %1, zeroinitializer
%5 = sitofp <4 x i1> %4 to <4 x float>
%6 = fsub <4 x float> %3, %5
store <4 x float> %6, <4 x float>* %0
ret void
}
; generic implementation for 256-bit vectors: generates really bad code
; it looks like every element is processed individually
define void @signum32b(<8 x float>*) {
_L1:
%1 = load <8 x float>* %0
%2 = fcmp olt <8 x float> %1, zeroinitializer
%3 = sitofp <8 x i1> %2 to <8 x float>
%4 = fcmp ogt <8 x float> %1, zeroinitializer
%5 = sitofp <8 x i1> %4 to <8 x float>
%6 = fsub <8 x float> %3, %5
store <8 x float> %6, <8 x float>* %0
ret void
}
; implementation using AVX intrinsics
; the assembly code looks almost the same as the one for signum32a (good)
define void @signum32c(<8 x float>*) {
_L1:
%1 = load <8 x float>* %0
%2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
%3 = bitcast <8 x float> %2 to <8 x i32>
%4 = sitofp <8 x i32> %3 to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %1, i8 1)
%6 = bitcast <8 x float> %5 to <8 x i32>
%7 = sitofp <8 x i32> %6 to <8 x float>
%8 = fsub <8 x float> %4, %7
store <8 x float> %8, <8 x float>* %0
ret void
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
; generic implementation for 128-bit vectors: looks acceptable
; it could use a vectorized int-to-floating-point conversion
define void @signum64a(<2 x double>*) {
_L1:
%1 = load <2 x double>* %0
%2 = fcmp olt <2 x double> %1, zeroinitializer
%3 = sitofp <2 x i1> %2 to <2 x double>
%4 = fcmp ogt <2 x double> %1, zeroinitializer
%5 = sitofp <2 x i1> %4 to <2 x double>
%6 = fsub <2 x double> %3, %5
store <2 x double> %6, <2 x double>* %0
ret void
}
; generic implementation for 256-bit vectors: looks acceptable
; I think the adjacent vpslld and vpsrad are redundant
; because the result of vcmpltpd can only contain elements with value 0 or -1
; (see the note after the assembly dump below).
define void @signum64b(<4 x double>*) {
_L1:
%1 = load <4 x double>* %0
%2 = fcmp olt <4 x double> %1, zeroinitializer
%3 = sitofp <4 x i1> %2 to <4 x double>
%4 = fcmp ogt <4 x double> %1, zeroinitializer
%5 = sitofp <4 x i1> %4 to <4 x double>
%6 = fsub <4 x double> %3, %5
store <4 x double> %6, <4 x double>* %0
ret void
}
; specialized implementation using AVX intrinsics
; Since the result of the comparison is i32, not i1,
; we can perform the subtraction on the integers
; (see the scalar C sketch after this listing).
; It seems to be important how the shufflevector call is written.
define void @signum64c(<4 x double>*) {
_L1:
%x = load <4 x double>* %0
%xgt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %x, <4 x double> zeroinitializer, i8 1)
%igt = bitcast <4 x double> %xgt to <8 x i32>
%xlt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %x, i8 1)
%ilt = bitcast <4 x double> %xlt to <8 x i32>
; it is important to use %isign twice as the source in order to make LLVM use a shuffle operation
%isign = sub <8 x i32> %igt, %ilt
%ssign = shufflevector <8 x i32> %isign, <8 x i32> %isign, <4 x i32> <i32 0, i32 2, i32 12, i32 14>
%sign = tail call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %ssign)
store <4 x double> %sign, <4 x double>* %0
ret void
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
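As a scalar illustration of the trick signum64c relies on (a hedged sketch;
the name signum64 and the explicit mask variables are only illustrative, not
part of the reproducer): the comparison produces an all-ones or all-zeros
mask, which reads as -1 or 0 when reinterpreted as an integer, so the whole
subtraction can be done on the integer masks and a single int-to-double
conversion finishes the job.

#include <stdint.h>
#include <stdio.h>

static double signum64(double x)
{
    /* Mimic the vcmppd masks: -1 (all ones) when the predicate holds, else 0. */
    int32_t lt_mask = -(int32_t)(x < 0.0);
    int32_t gt_mask = -(int32_t)(x > 0.0);
    /* lt_mask - gt_mask is -1, 0, or +1; one cvtdq2pd-style conversion at the end. */
    return (double)(lt_mask - gt_mask);
}

int main(void)
{
    printf("%g %g %g\n", signum64(-0.5), signum64(0.0), signum64(7.0));
    /* expected output: -1 0 1 */
    return 0;
}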
$ cat signum.ll | llvm-as | llc
.file "<stdin>"
.text
.globl signum32a
.align 16, 0x90
.type signum32a,@function
signum32a: # @signum32a
.cfi_startproc
# BB#0: # %_L1
vmovaps (%rdi), %xmm0
vxorps %xmm1, %xmm1, %xmm1
vcmpltps %xmm0, %xmm1, %xmm2
vcvtdq2ps %xmm2, %xmm2
vcmpltps %xmm1, %xmm0, %xmm0
vcvtdq2ps %xmm0, %xmm0
vsubps %xmm2, %xmm0, %xmm0
vmovaps %xmm0, (%rdi)
ret
.Ltmp0:
.size signum32a, .Ltmp0-signum32a
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LCPI1_0:
.byte 0 # 0x0
.byte 1 # 0x1
.byte 4 # 0x4
.byte 5 # 0x5
.byte 8 # 0x8
.byte 9 # 0x9
.byte 12 # 0xc
.byte 13 # 0xd
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.text
.globl signum32b
.align 16, 0x90
.type signum32b,@function
signum32b: # @signum32b
.cfi_startproc
# BB#0: # %_L1
pushq %rbp
.Ltmp3:
.cfi_def_cfa_offset 16
.Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp5:
.cfi_def_cfa_register %rbp
vmovaps (%rdi), %ymm0
vxorps %ymm2, %ymm2, %ymm2
vcmpltps %ymm2, %ymm0, %ymm3
vextractf128 $1, %ymm3, %xmm4
vmovdqa .LCPI1_0(%rip), %xmm1
vpshufb %xmm1, %xmm4, %xmm4
vpshufb %xmm1, %xmm3, %xmm3
vmovlhps %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0]
vpsllw $15, %xmm3, %xmm3
vcmpltps %ymm0, %ymm2, %ymm2
vextractf128 $1, %ymm2, %xmm4
vpsraw $15, %xmm3, %xmm0
vpshufb %xmm1, %xmm4, %xmm3
vpshufb %xmm1, %xmm2, %xmm1
vmovlhps %xmm3, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm3[0]
vpsllw $15, %xmm1, %xmm1
vpsraw $15, %xmm1, %xmm1
vpextrw $2, %xmm1, %edx
vpextrw $6, %xmm1, %esi
vpextrw $3, %xmm1, %eax
vpextrw $4, %xmm0, %r10d
vpextrw $1, %xmm1, %ecx
movswl %cx, %ecx
vcvtsi2ss %ecx, %xmm0, %xmm4
vpextrw $4, %xmm1, %r8d
vpextrw $1, %xmm0, %r9d
movswl %dx, %edx
vpextrw $5, %xmm1, %ecx
movswl %cx, %ecx
vcvtsi2ss %ecx, %xmm0, %xmm3
vcvtsi2ss %edx, %xmm0, %xmm8
movswl %si, %edx
movswl %ax, %eax
movswl %r10w, %ecx
vmovd %xmm1, %esi
movswl %si, %esi
vcvtsi2ss %esi, %xmm0, %xmm2
vinsertps $16, %xmm4, %xmm2, %xmm2 # xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
vpextrw $6, %xmm0, %esi
vcvtsi2ss %eax, %xmm0, %xmm11
vcvtsi2ss %edx, %xmm0, %xmm10
vcvtsi2ss %ecx, %xmm0, %xmm9
vpextrw $2, %xmm0, %ecx
movswl %r9w, %eax
movswl %r8w, %edx
vcvtsi2ss %edx, %xmm0, %xmm4
vpextrw $5, %xmm0, %edx
movswl %dx, %edx
vcvtsi2ss %edx, %xmm0, %xmm7
vinsertps $16, %xmm3, %xmm4, %xmm4 # xmm4 = xmm4[0],xmm3[0],xmm4[2,3]
vinsertps $32, %xmm8, %xmm2, %xmm6 # xmm6 = xmm2[0,1],xmm8[0],xmm2[3]
vcvtsi2ss %eax, %xmm0, %xmm2
movswl %si, %eax
vcvtsi2ss %eax, %xmm0, %xmm3
vpextrw $7, %xmm1, %eax
movswl %ax, %eax
vcvtsi2ss %eax, %xmm0, %xmm5
vinsertps $48, %xmm11, %xmm6, %xmm1 # xmm1 = xmm6[0,1,2],xmm11[0]
vinsertps $32, %xmm10, %xmm4, %xmm4 # xmm4 = xmm4[0,1],xmm10[0],xmm4[3]
vinsertps $16, %xmm7, %xmm9, %xmm6 # xmm6 = xmm9[0],xmm7[0],xmm9[2,3]
movswl %cx, %eax
vmovd %xmm0, %ecx
movswl %cx, %ecx
vcvtsi2ss %ecx, %xmm0, %xmm7
vinsertps $16, %xmm2, %xmm7, %xmm2 # xmm2 = xmm7[0],xmm2[0],xmm7[2,3]
vinsertps $32, %xmm3, %xmm6, %xmm3 # xmm3 = xmm6[0,1],xmm3[0],xmm6[3]
vinsertps $48, %xmm5, %xmm4, %xmm4 # xmm4 = xmm4[0,1,2],xmm5[0]
vcvtsi2ss %eax, %xmm0, %xmm5
vpextrw $7, %xmm0, %eax
movswl %ax, %eax
vcvtsi2ss %eax, %xmm0, %xmm6
vinsertf128 $1, %xmm4, %ymm1, %ymm1
vinsertps $48, %xmm6, %xmm3, %xmm3 # xmm3 = xmm3[0,1,2],xmm6[0]
vinsertps $32, %xmm5, %xmm2, %xmm2 # xmm2 = xmm2[0,1],xmm5[0],xmm2[3]
vpextrw $3, %xmm0, %eax
movswl %ax, %eax
vcvtsi2ss %eax, %xmm0, %xmm0
vinsertps $48, %xmm0, %xmm2, %xmm0 # xmm0 = xmm2[0,1,2],xmm0[0]
vinsertf128 $1, %xmm3, %ymm0, %ymm0
vsubps %ymm1, %ymm0, %ymm0
vmovaps %ymm0, (%rdi)
popq %rbp
vzeroupper
ret
.Ltmp6:
.size signum32b, .Ltmp6-signum32b
.cfi_endproc
.globl signum32c
.align 16, 0x90
.type signum32c,@function
signum32c: # @signum32c
.cfi_startproc
# BB#0: # %_L1
pushq %rbp
.Ltmp9:
.cfi_def_cfa_offset 16
.Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp11:
.cfi_def_cfa_register %rbp
vmovaps (%rdi), %ymm0
vxorps %ymm1, %ymm1, %ymm1
vcmpltps %ymm0, %ymm1, %ymm2
vcvtdq2ps %ymm2, %ymm2
vcmpltps %ymm1, %ymm0, %ymm0
vcvtdq2ps %ymm0, %ymm0
vsubps %ymm2, %ymm0, %ymm0
vmovaps %ymm0, (%rdi)
popq %rbp
vzeroupper
ret
.Ltmp12:
.size signum32c, .Ltmp12-signum32c
.cfi_endproc
.globl signum64a
.align 16, 0x90
.type signum64a,@function
signum64a: # @signum64a
.cfi_startproc
# BB#0: # %_L1
vmovapd (%rdi), %xmm0
vxorpd %xmm2, %xmm2, %xmm2
vcmpltpd %xmm0, %xmm2, %xmm1
vpextrq $1, %xmm1, %rax
vcmpltpd %xmm2, %xmm0, %xmm0
vcvtsi2sdq %rax, %xmm0, %xmm2
movd %xmm1, %rax
vcvtsi2sdq %rax, %xmm0, %xmm1
vunpcklpd %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
vpextrq $1, %xmm0, %rax
vcvtsi2sdq %rax, %xmm0, %xmm2
movd %xmm0, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vunpcklpd %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm2[0]
vsubpd %xmm1, %xmm0, %xmm0
vmovapd %xmm0, (%rdi)
ret
.Ltmp13:
.size signum64a, .Ltmp13-signum64a
.cfi_endproc
.globl signum64b
.align 16, 0x90
.type signum64b,@function
signum64b: # @signum64b
.cfi_startproc
# BB#0: # %_L1
pushq %rbp
.Ltmp16:
.cfi_def_cfa_offset 16
.Ltmp17:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp18:
.cfi_def_cfa_register %rbp
vmovapd (%rdi), %ymm0
vxorps %ymm1, %ymm1, %ymm1
vcmpltpd %ymm0, %ymm1, %ymm2
vextractf128 $1, %ymm2, %xmm3
vpshufd $8, %xmm3, %xmm3 # xmm3 = xmm3[0,2,0,0]
vpshufd $8, %xmm2, %xmm2 # xmm2 = xmm2[0,2,0,0]
vmovlhps %xmm3, %xmm2, %xmm2 # xmm2 = xmm2[0],xmm3[0]
vpslld $31, %xmm2, %xmm2
vpsrad $31, %xmm2, %xmm2
vcmpltpd %ymm1, %ymm0, %ymm1
vextractf128 $1, %ymm1, %xmm3
vcvtdq2pd %xmm2, %ymm0
vpshufd $8, %xmm3, %xmm2 # xmm2 = xmm3[0,2,0,0]
vpshufd $8, %xmm1, %xmm1 # xmm1 = xmm1[0,2,0,0]
vmovlhps %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
vpslld $31, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vcvtdq2pd %xmm1, %ymm1
vsubpd %ymm0, %ymm1, %ymm0
vmovapd %ymm0, (%rdi)
popq %rbp
vzeroupper
ret
.Ltmp19:
.size signum64b, .Ltmp19-signum64b
.cfi_endproc
.globl signum64c
.align 16, 0x90
.type signum64c,@function
signum64c: # @signum64c
.cfi_startproc
# BB#0: # %_L1
pushq %rbp
.Ltmp22:
.cfi_def_cfa_offset 16
.Ltmp23:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp24:
.cfi_def_cfa_register %rbp
vmovapd (%rdi), %ymm1
vxorps %ymm2, %ymm2, %ymm2
vcmpltpd %ymm1, %ymm2, %ymm0
vextractf128 $1, %ymm0, %xmm3
vcmpltpd %ymm2, %ymm1, %ymm1
vextractf128 $1, %ymm1, %xmm2
vpsubd %xmm3, %xmm2, %xmm2
vpsubd %xmm0, %xmm1, %xmm0
vshufps $-120, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0,2],xmm2[0,2]
vcvtdq2pd %xmm0, %ymm0
vmovaps %ymm0, (%rdi)
popq %rbp
vzeroupper
ret
.Ltmp25:
.size signum64c, .Ltmp25-signum64c
.cfi_endproc
.section ".note.GNU-stack","", at progbits