[LLVMdev] SIMD for sdiv <2 x i64>
zhi chen
zchenhn at gmail.com
Fri Jul 24 10:52:40 PDT 2015
------------------------------------ IR ------------------------------------
if.then.i.i.i.i.i.i:                              ; preds = %if.then4
  %S25_D = zext <2 x i32> %splatLDS17_D.splat to <2 x i64>
  %umul_with_overflow.i.iS26_D = shl <2 x i64> %S25_D, <i64 3, i64 3>
  %extumul_with_overflow.i.iS26_D = extractelement <2 x i64> %umul_with_overflow.i.iS26_D, i32 1
  %call5.i.i = tail call noalias i8* @_Znam(i64 %extumul_with_overflow.i.iS26_D) #22
  %splatCallS27_D.splatinsert = insertelement <2 x i8*> undef, i8* %call5.i.i, i32 0
  %splatCallS27_D.splat = shufflevector <2 x i8*> %splatCallS27_D.splatinsert, <2 x i8*> undef, <2 x i32> zeroinitializer
  %bitcastS28_D = bitcast <2 x i8*> %splatCallS27_D.splat to <2 x double*>
  %extractS29_D = extractelement <2 x double*> %bitcastS28_D, i32 1
  store double* %extractS29_D, double** %val.i.i, align 8
  %val.i3.i.i = getelementptr inbounds %class.Vector* %__x, i64 0, i32 3
  %4 = load double** %val.i3.i.i, align 8, !tbaa !22
  %splatLDS31_D.splatinsert = insertelement <2 x double*> undef, double* %4, i32 0
  %splatLDS31_D.splat = shufflevector <2 x double*> %splatLDS31_D.splatinsert, <2 x double*> undef, <2 x i32> zeroinitializer
  %bitcastS32_D = bitcast <2 x double*> %splatLDS31_D.splat to <2 x i8*>
  %extbitcastS32_D = extractelement <2 x i8*> %bitcastS32_D, i32 1
  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %call5.i.i, i8* %extbitcastS32_D, i64 %extumul_with_overflow.i.iS26_D, i32 8, i1 false) #9
  br label %invoke.cont

invoke.cont:                                      ; preds = %if.then.i.i.i.i.i.i, %if.then4
  %sub.ptr.rhs.cast.i = ptrtoint %class.Vector* %__position.coerce to i64
  %sub.ptr.rhs.cast.iS35_D = ptrtoint <2 x %class.Vector*> %splatInsMapS35_D.splat to <2 x i64>
  %sub.ptr.sub.iS36_D = sub <2 x i64> %sub.ptr.rhs.castS8_D, %sub.ptr.rhs.cast.iS35_D
  %sub.ptr.div.iS37_D = sdiv <2 x i64> %sub.ptr.sub.iS36_D, <i64 24, i64 24>
  %extractS196_D = extractelement <2 x i64> %sub.ptr.div.iS37_D, i32 1
  %cmp10S38_D = icmp ugt <2 x i64> %sub.ptr.div.iS37_D, %splatInsMapS1_D.splat
  %zextS39_D = sext <2 x i1> %cmp10S38_D to <2 x i64>
  %BCS39_D = bitcast <2 x i64> %zextS39_D to i128
  %mskS39_D = icmp ne i128 %BCS39_D, 0
  br i1 %mskS39_D, label %if.then11, label %if.else
-------------------------------------------- Assembly --------------------------------------------
# BB#3: # %if.then.i.i.i.i.i.i
vpsllq $3, %xmm0, %xmm0
vpextrq $1, %xmm0, %rbx
movq %rbx, %rdi
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovaps %xmm5, 64(%rsp) # 16-byte Spill
vmovdqa %xmm6, 16(%rsp) # 16-byte Spill
callq _Znam
movq %rax, 128(%rsp)
movq 16(%r12), %rsi
movq %rax, %rdi
movq %rbx, %rdx
callq memmove
vmovdqa 16(%rsp), %xmm6 # 16-byte Reload
vmovaps 64(%rsp), %xmm5 # 16-byte Reload
vmovaps 96(%rsp), %xmm2 # 16-byte Reload
vmovdqa .LCPI582_0(%rip), %xmm4
.LBB582_4: # %invoke.cont
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovdqa 48(%rsp), %xmm0 # 16-byte Reload
vpsubq %xmm0, %xmm2, %xmm0
vpextrq $1, %xmm0, %rax
movabsq $3074457345618258603, %rcx # imm = 0x2AAAAAAAAAAAAAAB
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm1
vmovq %xmm0, %rax
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm0
vpunpcklqdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[0],xmm1[0]
vpxor %xmm4, %xmm1, %xmm0
vpcmpgtq %xmm6, %xmm0, %xmm0
vptest %xmm0, %xmm0
je .LBB582_49
Thanks,
Zhi
On Fri, Jul 24, 2015 at 10:16 AM, Philip Reames <listmail at philipreames.com>
wrote:
>
>
> On 07/24/2015 03:42 AM, Benjamin Kramer wrote:
>
>> On 24.07.2015, at 08:06, zhi chen <zchenhn at gmail.com> wrote:
>>>
>>> It seems that it's hard to vectorize int64 in LLVM. For example,
>>> LLVM 3.4 generates very complicated code for the following IR. I am
>>> running on a Haswell processor. Is it because there are no suitable
>>> AVX/AVX2 instructions for int64? The same thing also happens to
>>> zext <2 x i32> -> <2 x i64> and trunc <2 x i64> -> <2 x i32> (minimal
>>> forms of both casts are sketched right after this paragraph). Any
>>> ideas to optimize these instructions? Thanks.
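>>>
>>> A minimal sketch of those two casts (%v32 and %v64 are illustrative
>>> values, not taken from the module below):
>>>
>>>   %widened  = zext <2 x i32> %v32 to <2 x i64>
>>>   %narrowed = trunc <2 x i64> %v64 to <2 x i32>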
>>>
>>> %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
>>> %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>
>>>
>>> Assembly:
>>> vpsubq %xmm6, %xmm5, %xmm5
>>> vmovq %xmm5, %rax
>>> movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>>> imulq %rbx
>>> movq %rdx, %rcx
>>> movq %rcx, %rax
>>> shrq $63, %rax
>>> shrq $2, %rcx
>>> addl %eax, %ecx
>>> vpextrq $1, %xmm5, %rax
>>> imulq %rbx
>>> movq %rdx, %rax
>>> shrq $63, %rax
>>> shrq $2, %rdx
>>> addl %eax, %edx
>>> movslq %edx, %rax
>>> vmovq %rax, %xmm5
>>> movslq %ecx, %rax
>>> vmovq %rax, %xmm6
>>> vpunpcklqdq %xmm5, %xmm6, %xmm5 # xmm5 = xmm6[0],xmm5[0]
>>>
>> AVX2 doesn't have integer vector division instructions, and LLVM
>> lowers divides by constants into (128-bit) multiplies. However, AVX2
>> doesn't have a way to get at the upper 64 bits of a 64x64->128-bit
>> multiply either, so LLVM uses the scalar imulq instruction to do
>> that. There's not much room to optimize here given the limitations
>> of AVX2.
>>
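>> Concretely, for a divisor of 24 the lowering picks the magic constant
>> m = ceil(2^66 / 24) = 0x2AAAAAAAAAAAAAAB and computes x / 24 as the
>> high half of a widening multiply plus shifts, which is the
>> imulq/shrq/sarq sequence in your assembly. A scalar IR sketch of the
>> same computation (%x and the other names are illustrative):
>>
>>   %wide = sext i64 %x to i128
>>   %mul  = mul i128 %wide, 3074457345618258603  ; x * 0x2AAAAAAAAAAAAAAB
>>   %shr  = ashr i128 %mul, 66                   ; floor((x * m) / 2^66)
>>   %q0   = trunc i128 %shr to i64
>>   %sign = lshr i64 %x, 63                      ; 1 iff x < 0
>>   %q    = add i64 %q0, %sign                   ; round toward zero
>>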
>> You seem to be subtracting pointers though, so if you can guarantee that
>> the pointers are aligned you could set the exact bit on your 'sdiv'
>> instruction. That should give better code.
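>>
>> For example, if the two pointers are known to differ by a multiple of
>> the element size, a sketch with illustrative names:
>>
>>   %diff = sub <2 x i64> %lhs, %rhs
>>   %n    = sdiv exact <2 x i64> %diff, <i64 24, i64 24>
>>
>> With the exact flag the divide can be lowered as a right shift plus a
>> multiply by the multiplicative inverse of 3, avoiding the 128-bit
>> multiply-high entirely.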
>>
> Depending on what you're using the result of the divide for, there
> might be optimizations which could be applied as well. Can you give a
> slightly larger context for your source IR? (1-2 levels of uses/defs
> out from the instructions would help.)
>
>>
>> - Ben
>>
>
>