[LLVMdev] SIMD for sdiv <2 x i64>

zhi chen zchenhn at gmail.com
Fri Jul 24 10:52:40 PDT 2015


------------------------------------ IR
------------------------------------------------------------------
if.then.i.i.i.i.i.i:                              ; preds = %if.then4
  %S25_D = zext <2 x i32> %splatLDS17_D.splat to <2 x i64>
  %umul_with_overflow.i.iS26_D = shl <2 x i64> %S25_D, <i64 3, i64 3>
  %extumul_with_overflow.i.iS26_D = extractelement <2 x i64>
%umul_with_overflow.i.iS26_D, i32 1
  %call5.i.i = tail call noalias i8* @_Znam(i64
%extumul_with_overflow.i.iS26_D) #22
  %splatCallS27_D.splatinsert = insertelement <2 x i8*> undef, i8*
%call5.i.i, i32 0
  %splatCallS27_D.splat = shufflevector <2 x i8*>
%splatCallS27_D.splatinsert, <2 x i8*> undef, <2 x i32> zeroinitializer
  %bitcastS28_D = bitcast <2 x i8*> %splatCallS27_D.splat to <2 x double*>
  %extractS29_D = extractelement <2 x double*> %bitcastS28_D, i32 1
  store double* %extractS29_D, double** %val.i.i, align 8
  %val.i3.i.i = getelementptr inbounds %class.Vector* %__x, i64 0, i32 3
  %4 = load double** %val.i3.i.i, align 8, !tbaa !22
  %splatLDS31_D.splatinsert = insertelement <2 x double*> undef, double*
%4, i32 0
  %splatLDS31_D.splat = shufflevector <2 x double*>
%splatLDS31_D.splatinsert, <2 x double*> undef, <2 x i32> zeroinitializer
  %bitcastS32_D = bitcast <2 x double*> %splatLDS31_D.splat to <2 x i8*>
  %extbitcastS32_D = extractelement <2 x i8*> %bitcastS32_D, i32 1
  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %call5.i.i, i8*
%extbitcastS32_D, i64 %extumul_with_overflow.i.iS26_D, i32 8, i1 false) #9
  br label %invoke.cont

invoke.cont:                                      ; preds =
%if.then.i.i.i.i.i.i, %if.then4
  %sub.ptr.rhs.cast.i = ptrtoint %class.Vector* %__position.coerce to i64
  %sub.ptr.rhs.cast.iS35_D = ptrtoint <2 x %class.Vector*>
%splatInsMapS35_D.splat to <2 x i64>
  %sub.ptr.sub.iS36_D = sub <2 x i64> %sub.ptr.rhs.castS8_D,
%sub.ptr.rhs.cast.iS35_D
  %sub.ptr.div.iS37_D = sdiv <2 x i64> %sub.ptr.sub.iS36_D, <i64 24, i64 24>
  %extractS196_D = extractelement <2 x i64> %sub.ptr.div.iS37_D, i32 1
  %cmp10S38_D = icmp ugt <2 x i64> %sub.ptr.div.iS37_D,
%splatInsMapS1_D.splat
  %zextS39_D = sext <2 x i1> %cmp10S38_D to <2 x i64>
  %BCS39_D = bitcast <2 x i64> %zextS39_D to i128
  %mskS39_D = icmp ne i128 %BCS39_D, 0
  br i1 %mskS39_D, label %if.then11, label %if.else

-------------------------------------------- Assembly
-----------------------------------------------------------------

# BB#3:                                 # %if.then.i.i.i.i.i.i
    vpsllq  $3, %xmm0, %xmm0
    vpextrq $1, %xmm0, %rbx
    movq    %rbx, %rdi
    vmovaps %xmm2, 96(%rsp)         # 16-byte Spill
    vmovaps %xmm5, 64(%rsp)         # 16-byte Spill
    vmovdqa %xmm6, 16(%rsp)         # 16-byte Spill
    callq   _Znam
    movq    %rax, 128(%rsp)
    movq    16(%r12), %rsi
    movq    %rax, %rdi
    movq    %rbx, %rdx
    callq   memmove
    vmovdqa 16(%rsp), %xmm6         # 16-byte Reload
    vmovaps 64(%rsp), %xmm5         # 16-byte Reload
    vmovaps 96(%rsp), %xmm2         # 16-byte Reload
    vmovdqa .LCPI582_0(%rip), %xmm4
.LBB582_4:                              # %invoke.cont
    vmovaps %xmm2, 96(%rsp)         # 16-byte Spill
    vmovdqa 48(%rsp), %xmm0         # 16-byte Reload
    vpsubq  %xmm0, %xmm2, %xmm0
    vpextrq $1, %xmm0, %rax
    movabsq $3074457345618258603, %rcx # imm = 0x2AAAAAAAAAAAAAAB
    imulq   %rcx
    movq    %rdx, %rax
    shrq    $63, %rax
    sarq    $2, %rdx
    addq    %rax, %rdx
    vmovq   %rdx, %xmm1
    vmovq   %xmm0, %rax
    imulq   %rcx
    movq    %rdx, %rax
    shrq    $63, %rax
    sarq    $2, %rdx
    addq    %rax, %rdx
    vmovq   %rdx, %xmm0
    vpunpcklqdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[0],xmm1[0]
    vpxor   %xmm4, %xmm1, %xmm0
    vpcmpgtq    %xmm6, %xmm0, %xmm0
    vptest  %xmm0, %xmm0
    je  .LBB582_49

Thanks,
Zhi

On Fri, Jul 24, 2015 at 10:16 AM, Philip Reames <listmail at philipreames.com>
wrote:

>
>
> On 07/24/2015 03:42 AM, Benjamin Kramer wrote:
>
>  On 24.07.2015, at 08:06, zhi chen <zchenhn at gmail.com> wrote:
>>>
>>> It seems that that it's hard to vectorize int64 in LLVM. For example,
>>> LLVM 3.4 generates very complicated code for the following IR. I am running
>>> on a Haswell processor. Is it because there is no alternative AVX/2
>>> instructions for int64? The same thing also happens to zext <2 x i32> -> <2
>>> x i64> and trunc <2 x i64> -> <2 x i32>. Any ideas to optimize these
>>> instructions? Thanks.
>>>
>>> %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i,
>>> %sub.ptr.rhs.cast.i5.i.i.i.i
>>> %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64
>>> 24, i64 24>
>>>
>>> Assembly:
>>>      vpsubq  %xmm6, %xmm5, %xmm5
>>>      vmovq   %xmm5, %rax
>>>      movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>>>      imulq   %rbx
>>>      movq    %rdx, %rcx
>>>      movq    %rcx, %rax
>>>      shrq    $63, %rax
>>>      shrq    $2, %rcx
>>>      addl    %eax, %ecx
>>>      vpextrq $1, %xmm5, %rax
>>>      imulq   %rbx
>>>      movq    %rdx, %rax
>>>      shrq    $63, %rax
>>>      shrq    $2, %rdx
>>>      addl    %eax, %edx
>>>      movslq  %edx, %rax
>>>      vmovq   %rax, %xmm5
>>>      movslq  %ecx, %rax
>>>      vmovq   %rax, %xmm6
>>>      vpunpcklqdq %xmm5, %xmm6, %xmm5 # xmm5 = xmm6[0],xmm5[0]
>>>
>> AVX2 doesn't have integer vector division instructions and LLVM lowers
>> divides by constants into (128 bit) multiplies. However, AVX2 doesn't have
>> a way to get to the upper 64 bits of a 64x64->128 bit multiply either, so
>> LLVM uses the scalar imulq instruction to do that. There's not much room to
>> optimize here given the limitations of AVX2.
>>
>> You seem to be subtracting pointers though, so if you can guarantee that
>> the pointers are aligned you could set the exact bit on your 'sdiv'
>> instruction. That should give better code.
>>
> Depending on what you're using the result of the divide for, there might
> be optimizations which could be applied as well.  Can you give a slightly
> larger context for your source IR?  (1-2 level of uses/defs out from the
> instructions would help)
>
>>
>> - Ben
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150724/eab0f46f/attachment.html>


More information about the llvm-dev mailing list