[LLVMbugs] [Bug 13248] New: efficient implementation of sitofp for vectors

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Sun Jul 1 03:10:02 PDT 2012


http://llvm.org/bugs/show_bug.cgi?id=13248

             Bug #: 13248
           Summary: efficient implementation of sitofp for vectors
           Product: libraries
           Version: 3.1
          Platform: PC
        OS/Version: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: llvm at henning-thielemann.de
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified


I want to implement a 'signum' function for float and double vectors. This
works well for SSE 128-bit vectors, but the generated code for AVX 256-bit
vectors is sometimes awful. I am using an AVX machine (no AVX2).
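
For reference, the scalar operation I want to vectorize is simply (x > 0) - (x < 0).
A minimal C sketch of the intended per-element semantics (the function name is
illustrative only):

float signum32(float x) {
    /* each comparison yields 0 or 1, so the difference is -1, 0 or +1 */
    return (float)(x > 0.0f) - (float)(x < 0.0f);
}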

$ cat signum.ll
; generic implementation for 128-bit vectors: works well
define void @signum32a(<4 x float>*) {
_L1:
  %1 = load <4 x float>* %0
  %2 = fcmp olt <4 x float> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x float>
  %4 = fcmp ogt <4 x float> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x float>
  %6 = fsub <4 x float> %3, %5
  store <4 x float> %6, <4 x float>* %0
  ret void
}

; generic implementation for 256-bit vectors: generates really bad code
; it looks like every element is processed individually
define void @signum32b(<8 x float>*) {
_L1:
  %1 = load <8 x float>* %0
  %2 = fcmp olt <8 x float> %1, zeroinitializer
  %3 = sitofp <8 x i1> %2 to <8 x float>
  %4 = fcmp ogt <8 x float> %1, zeroinitializer
  %5 = sitofp <8 x i1> %4 to <8 x float>
  %6 = fsub <8 x float> %3, %5
  store <8 x float> %6, <8 x float>* %0
  ret void
}

; implementation using AVX intrinsics
; the assembly code looks almost the same as the one for signum32a (good)
define void @signum32c(<8 x float>*) {
_L1:
  %1 = load <8 x float>* %0
  %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
  %3 = bitcast <8 x float> %2 to <8 x i32>
  %4 = sitofp <8 x i32> %3 to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %1, i8 1)
  %6 = bitcast <8 x float> %5 to <8 x i32>
  %7 = sitofp <8 x i32> %6 to <8 x float>
  %8 = fsub <8 x float> %4, %7
  store <8 x float> %8, <8 x float>* %0
  ret void
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone



; generic implementation for 128-bit vectors: looks acceptable
; it could make use of a vectorized conversion from an integer vector to a floating-point vector
define void @signum64a(<2 x double>*) {
_L1:
  %1 = load <2 x double>* %0
  %2 = fcmp olt <2 x double> %1, zeroinitializer
  %3 = sitofp <2 x i1> %2 to <2 x double>
  %4 = fcmp ogt <2 x double> %1, zeroinitializer
  %5 = sitofp <2 x i1> %4 to <2 x double>
  %6 = fsub <2 x double> %3, %5
  store <2 x double> %6, <2 x double>* %0
  ret void
}

; generic implementation for 256-bit vectors: looks acceptable
; I think the adjacent vpslld and vpsrad are redundant
; because the result of vcmpltpd can only contain elements with value 0 or -1.
define void @signum64b(<4 x double>*) {
_L1:
  %1 = load <4 x double>* %0
  %2 = fcmp olt <4 x double> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x double>
  %4 = fcmp ogt <4 x double> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x double>
  %6 = fsub <4 x double> %3, %5
  store <4 x double> %6, <4 x double>* %0
  ret void
}

; specialized implementation using AVX intrinsics
; Since the result of the comparison is i32 rather than i1,
; we can perform the subtraction on the integers.
; It seems to be important how the shufflevector call is written.
; (A C intrinsics sketch of this trick follows the listing.)
define void @signum64c(<4 x double>*) {
_L1:
  %x = load <4 x double>* %0
  %xgt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %x, <4 x double> zeroinitializer, i8 1)
  %igt = bitcast <4 x double> %xgt to <8 x i32>
  %xlt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %x, i8 1)
  %ilt = bitcast <4 x double> %xlt to <8 x i32>
  ; it is important to use %isign twice as the source in order to make LLVM use a shuffle operation
  %isign = sub <8 x i32> %igt, %ilt
  %ssign = shufflevector <8 x i32> %isign, <8 x i32> %isign, <4 x i32> <i32 0, i32 2, i32 12, i32 14>
  %sign = tail call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %ssign)
  store <4 x double> %sign, <4 x double>* %0
  ret void
}

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
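
For comparison, here is a rough C intrinsics sketch of the trick used in
signum64c (subtract the all-ones/all-zero comparison masks as 32-bit integers,
keep the low half of every 64-bit lane, then convert with vcvtdq2pd). It is
only an illustration of the code I would like the generic IR to compile to;
the function name is made up and not part of the test case:

#include <immintrin.h>

void signum64_intrin(double *p) {
    __m256d x    = _mm256_load_pd(p);
    __m256d zero = _mm256_setzero_pd();
    /* all-ones (-1 in every 32-bit lane) where the comparison holds, else 0 */
    __m256i neg = _mm256_castpd_si256(_mm256_cmp_pd(x, zero, _CMP_LT_OS));
    __m256i pos = _mm256_castpd_si256(_mm256_cmp_pd(zero, x, _CMP_LT_OS));
    /* per 32-bit lane: (x<0 ? -1 : 0) - (x>0 ? -1 : 0) gives -1, 0 or +1 */
    __m128i lo = _mm_sub_epi32(_mm256_castsi256_si128(neg),
                               _mm256_castsi256_si128(pos));
    __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(neg, 1),
                               _mm256_extractf128_si256(pos, 1));
    /* keep the low 32-bit half of each 64-bit element (lanes 0,2 of lo and hi) */
    __m128 packed = _mm_shuffle_ps(_mm_castsi128_ps(lo), _mm_castsi128_ps(hi),
                                   _MM_SHUFFLE(2, 0, 2, 0));
    /* widen 4 x i32 to 4 x double (vcvtdq2pd) */
    _mm256_store_pd(p, _mm256_cvtepi32_pd(_mm_castps_si128(packed)));
}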



$ cat signum.ll | llvm-as | llc
        .file   "<stdin>"
        .text
        .globl  signum32a
        .align  16, 0x90
        .type   signum32a,@function
signum32a:                              # @signum32a
        .cfi_startproc
# BB#0:                                 # %_L1
        vmovaps (%rdi), %xmm0
        vxorps  %xmm1, %xmm1, %xmm1
        vcmpltps        %xmm0, %xmm1, %xmm2
        vcvtdq2ps       %xmm2, %xmm2
        vcmpltps        %xmm1, %xmm0, %xmm0
        vcvtdq2ps       %xmm0, %xmm0
        vsubps  %xmm2, %xmm0, %xmm0
        vmovaps %xmm0, (%rdi)
        ret
.Ltmp0:
        .size   signum32a, .Ltmp0-signum32a
        .cfi_endproc

        .section        .rodata.cst16,"aM",@progbits,16
        .align  16
.LCPI1_0:
        .byte   0                       # 0x0
        .byte   1                       # 0x1
        .byte   4                       # 0x4
        .byte   5                       # 0x5
        .byte   8                       # 0x8
        .byte   9                       # 0x9
        .byte   12                      # 0xc
        .byte   13                      # 0xd
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .byte   128                     # 0x80
        .text
        .globl  signum32b
        .align  16, 0x90
        .type   signum32b,@function
signum32b:                              # @signum32b
        .cfi_startproc
# BB#0:                                 # %_L1
        pushq   %rbp
.Ltmp3:
        .cfi_def_cfa_offset 16
.Ltmp4:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp5:
        .cfi_def_cfa_register %rbp
        vmovaps (%rdi), %ymm0
        vxorps  %ymm2, %ymm2, %ymm2
        vcmpltps        %ymm2, %ymm0, %ymm3
        vextractf128    $1, %ymm3, %xmm4
        vmovdqa .LCPI1_0(%rip), %xmm1
        vpshufb %xmm1, %xmm4, %xmm4
        vpshufb %xmm1, %xmm3, %xmm3
        vmovlhps        %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0]
        vpsllw  $15, %xmm3, %xmm3
        vcmpltps        %ymm0, %ymm2, %ymm2
        vextractf128    $1, %ymm2, %xmm4
        vpsraw  $15, %xmm3, %xmm0
        vpshufb %xmm1, %xmm4, %xmm3
        vpshufb %xmm1, %xmm2, %xmm1
        vmovlhps        %xmm3, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm3[0]
        vpsllw  $15, %xmm1, %xmm1
        vpsraw  $15, %xmm1, %xmm1
        vpextrw $2, %xmm1, %edx
        vpextrw $6, %xmm1, %esi
        vpextrw $3, %xmm1, %eax
        vpextrw $4, %xmm0, %r10d
        vpextrw $1, %xmm1, %ecx
        movswl  %cx, %ecx
        vcvtsi2ss       %ecx, %xmm0, %xmm4
        vpextrw $4, %xmm1, %r8d
        vpextrw $1, %xmm0, %r9d
        movswl  %dx, %edx
        vpextrw $5, %xmm1, %ecx
        movswl  %cx, %ecx
        vcvtsi2ss       %ecx, %xmm0, %xmm3
        vcvtsi2ss       %edx, %xmm0, %xmm8
        movswl  %si, %edx
        movswl  %ax, %eax
        movswl  %r10w, %ecx
        vmovd   %xmm1, %esi
        movswl  %si, %esi
        vcvtsi2ss       %esi, %xmm0, %xmm2
        vinsertps       $16, %xmm4, %xmm2, %xmm2 # xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
        vpextrw $6, %xmm0, %esi
        vcvtsi2ss       %eax, %xmm0, %xmm11
        vcvtsi2ss       %edx, %xmm0, %xmm10
        vcvtsi2ss       %ecx, %xmm0, %xmm9
        vpextrw $2, %xmm0, %ecx
        movswl  %r9w, %eax
        movswl  %r8w, %edx
        vcvtsi2ss       %edx, %xmm0, %xmm4
        vpextrw $5, %xmm0, %edx
        movswl  %dx, %edx
        vcvtsi2ss       %edx, %xmm0, %xmm7
        vinsertps       $16, %xmm3, %xmm4, %xmm4 # xmm4 = xmm4[0],xmm3[0],xmm4[2,3]
        vinsertps       $32, %xmm8, %xmm2, %xmm6 # xmm6 = xmm2[0,1],xmm8[0],xmm2[3]
        vcvtsi2ss       %eax, %xmm0, %xmm2
        movswl  %si, %eax
        vcvtsi2ss       %eax, %xmm0, %xmm3
        vpextrw $7, %xmm1, %eax
        movswl  %ax, %eax
        vcvtsi2ss       %eax, %xmm0, %xmm5
        vinsertps       $48, %xmm11, %xmm6, %xmm1 # xmm1 = xmm6[0,1,2],xmm11[0]
        vinsertps       $32, %xmm10, %xmm4, %xmm4 # xmm4 = xmm4[0,1],xmm10[0],xmm4[3]
        vinsertps       $16, %xmm7, %xmm9, %xmm6 # xmm6 = xmm9[0],xmm7[0],xmm9[2,3]
        movswl  %cx, %eax
        vmovd   %xmm0, %ecx
        movswl  %cx, %ecx
        vcvtsi2ss       %ecx, %xmm0, %xmm7
        vinsertps       $16, %xmm2, %xmm7, %xmm2 # xmm2 = xmm7[0],xmm2[0],xmm7[2,3]
        vinsertps       $32, %xmm3, %xmm6, %xmm3 # xmm3 = xmm6[0,1],xmm3[0],xmm6[3]
        vinsertps       $48, %xmm5, %xmm4, %xmm4 # xmm4 = xmm4[0,1,2],xmm5[0]
        vcvtsi2ss       %eax, %xmm0, %xmm5
        vpextrw $7, %xmm0, %eax
        movswl  %ax, %eax
        vcvtsi2ss       %eax, %xmm0, %xmm6
        vinsertf128     $1, %xmm4, %ymm1, %ymm1
        vinsertps       $48, %xmm6, %xmm3, %xmm3 # xmm3 = xmm3[0,1,2],xmm6[0]
        vinsertps       $32, %xmm5, %xmm2, %xmm2 # xmm2 = xmm2[0,1],xmm5[0],xmm2[3]
        vpextrw $3, %xmm0, %eax
        movswl  %ax, %eax
        vcvtsi2ss       %eax, %xmm0, %xmm0
        vinsertps       $48, %xmm0, %xmm2, %xmm0 # xmm0 = xmm2[0,1,2],xmm0[0]
        vinsertf128     $1, %xmm3, %ymm0, %ymm0
        vsubps  %ymm1, %ymm0, %ymm0
        vmovaps %ymm0, (%rdi)
        popq    %rbp
        vzeroupper
        ret
.Ltmp6:
        .size   signum32b, .Ltmp6-signum32b
        .cfi_endproc

        .globl  signum32c
        .align  16, 0x90
        .type   signum32c,@function
signum32c:                              # @signum32c
        .cfi_startproc
# BB#0:                                 # %_L1
        pushq   %rbp
.Ltmp9:
        .cfi_def_cfa_offset 16
.Ltmp10:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp11:
        .cfi_def_cfa_register %rbp
        vmovaps (%rdi), %ymm0
        vxorps  %ymm1, %ymm1, %ymm1
        vcmpltps        %ymm0, %ymm1, %ymm2
        vcvtdq2ps       %ymm2, %ymm2
        vcmpltps        %ymm1, %ymm0, %ymm0
        vcvtdq2ps       %ymm0, %ymm0
        vsubps  %ymm2, %ymm0, %ymm0
        vmovaps %ymm0, (%rdi)
        popq    %rbp
        vzeroupper
        ret
.Ltmp12:
        .size   signum32c, .Ltmp12-signum32c
        .cfi_endproc

        .globl  signum64a
        .align  16, 0x90
        .type   signum64a,@function
signum64a:                              # @signum64a
        .cfi_startproc
# BB#0:                                 # %_L1
        vmovapd (%rdi), %xmm0
        vxorpd  %xmm2, %xmm2, %xmm2
        vcmpltpd        %xmm0, %xmm2, %xmm1
        vpextrq $1, %xmm1, %rax
        vcmpltpd        %xmm2, %xmm0, %xmm0
        vcvtsi2sdq      %rax, %xmm0, %xmm2
        movd    %xmm1, %rax
        vcvtsi2sdq      %rax, %xmm0, %xmm1
        vunpcklpd       %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
        vpextrq $1, %xmm0, %rax
        vcvtsi2sdq      %rax, %xmm0, %xmm2
        movd    %xmm0, %rax
        vcvtsi2sdq      %rax, %xmm0, %xmm0
        vunpcklpd       %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm2[0]
        vsubpd  %xmm1, %xmm0, %xmm0
        vmovapd %xmm0, (%rdi)
        ret
.Ltmp13:
        .size   signum64a, .Ltmp13-signum64a
        .cfi_endproc

        .globl  signum64b
        .align  16, 0x90
        .type   signum64b,@function
signum64b:                              # @signum64b
        .cfi_startproc
# BB#0:                                 # %_L1
        pushq   %rbp
.Ltmp16:
        .cfi_def_cfa_offset 16
.Ltmp17:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp18:
        .cfi_def_cfa_register %rbp
        vmovapd (%rdi), %ymm0
        vxorps  %ymm1, %ymm1, %ymm1
        vcmpltpd        %ymm0, %ymm1, %ymm2
        vextractf128    $1, %ymm2, %xmm3
        vpshufd $8, %xmm3, %xmm3        # xmm3 = xmm3[0,2,0,0]
        vpshufd $8, %xmm2, %xmm2        # xmm2 = xmm2[0,2,0,0]
        vmovlhps        %xmm3, %xmm2, %xmm2 # xmm2 = xmm2[0],xmm3[0]
        vpslld  $31, %xmm2, %xmm2
        vpsrad  $31, %xmm2, %xmm2
        vcmpltpd        %ymm1, %ymm0, %ymm1
        vextractf128    $1, %ymm1, %xmm3
        vcvtdq2pd       %xmm2, %ymm0
        vpshufd $8, %xmm3, %xmm2        # xmm2 = xmm3[0,2,0,0]
        vpshufd $8, %xmm1, %xmm1        # xmm1 = xmm1[0,2,0,0]
        vmovlhps        %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
        vpslld  $31, %xmm1, %xmm1
        vpsrad  $31, %xmm1, %xmm1
        vcvtdq2pd       %xmm1, %ymm1
        vsubpd  %ymm0, %ymm1, %ymm0
        vmovapd %ymm0, (%rdi)
        popq    %rbp
        vzeroupper
        ret
.Ltmp19:
        .size   signum64b, .Ltmp19-signum64b
        .cfi_endproc

        .globl  signum64c
        .align  16, 0x90
        .type   signum64c,@function
signum64c:                              # @signum64c
        .cfi_startproc
# BB#0:                                 # %_L1
        pushq   %rbp
.Ltmp22:
        .cfi_def_cfa_offset 16
.Ltmp23:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp24:
        .cfi_def_cfa_register %rbp
        vmovapd (%rdi), %ymm1
        vxorps  %ymm2, %ymm2, %ymm2
        vcmpltpd        %ymm1, %ymm2, %ymm0
        vextractf128    $1, %ymm0, %xmm3
        vcmpltpd        %ymm2, %ymm1, %ymm1
        vextractf128    $1, %ymm1, %xmm2
        vpsubd  %xmm3, %xmm2, %xmm2
        vpsubd  %xmm0, %xmm1, %xmm0
        vshufps $-120, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0,2],xmm2[0,2]
        vcvtdq2pd       %xmm0, %ymm0
        vmovaps %ymm0, (%rdi)
        popq    %rbp
        vzeroupper
        ret
.Ltmp25:
        .size   signum64c, .Ltmp25-signum64c
        .cfi_endproc


        .section        ".note.GNU-stack","",@progbits
