[llvm-bugs] [Bug 47558] New: The number of SIMD loads increases unnecessarily
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Sep 16 23:58:10 PDT 2020
https://bugs.llvm.org/show_bug.cgi?id=47558
Bug ID: 47558
Summary: The number of SIMD loads increases unnecessarily
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Scalar Optimizations
Assignee: unassignedbugs at nondot.org
Reporter: kazu at google.com
CC: llvm-bugs at lists.llvm.org
This patch:
https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd
increases the number of memory loads in certain cases.
Consider:
target triple = "x86_64-unknown-linux-gnu"
declare dso_local float* @getscaleptr() #0
define void @foo(<2 x float>* nonnull %resultptr, <2 x float>* nonnull %opptr)
{
%scaleptr = call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%op0 = extractelement <2 x float> %op, i32 0
%product0 = fmul float %op0, %scale
%result0 = insertelement <2 x float> undef, float %product0, i32 0
%op1 = extractelement <2 x float> %op, i32 1
%product1 = fmul float %op1, %scale
%result1 = insertelement <2 x float> %result0, float %product1, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
}
This testcase multiplies a <2 x float> value by a scalar float value
and stores the result back to memory.
Compile like so:
$ clang -O2 -msse4.2 -S bug.ll -o bug.s
Comparing the assembly generated without (-) and with (+) the patch, I get the following diff:
pushq %r14
pushq %rbx
pushq %rax
movq %rsi, %rbx
movq %rdi, %r14
callq getscaleptr
movsd (%rbx), %xmm0 # xmm0 = mem[0],zero
- movss (%rax), %xmm1 # xmm1 = mem[0],zero,zero,zero
- movsldup %xmm1, %xmm1 # xmm1 = xmm1[0,0,2,2]
+ movaps (%rax), %xmm1
+ insertps $16, (%rax), %xmm1 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
mulps %xmm0, %xmm1
movlps %xmm1, (%r14)
addq $8, %rsp
popq %rbx
popq %r14
retq
Note that the patch replaces the movss/movsldup pair with movaps/insertps,
where insertps reads from the same location as movaps, increasing the
number of loads.
Here is the "IR Dump After Optimize scalar/vector ops".
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = insertelement <2 x float> %1, float %scale, i32 1
%3 = fmul <2 x float> %op, %2
%4 = extractelement <2 x float> %3, i32 0
%result0 = insertelement <2 x float> undef, float %4, i32 0
%5 = extractelement <2 x float> %3, i32 1
%result1 = insertelement <2 x float> %result0, float %5, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
%6 = extractelement <2 x float> %5, i32 0
%result0 = insertelement <2 x float> undef, float %6, i32 0
%7 = extractelement <2 x float> %5, i32 1
%result1 = insertelement <2 x float> %result0, float %7, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
Notice the three loads with the patch (%op, the new <4 x float> load from %scaleptr, and %scale), compared with two loads without it.
Here is the final LLVM IR.
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
%3 = fmul <2 x float> %op, %2
store <2 x float> %3, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 undef>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
store <2 x float> %5, <2 x float>* %resultptr, align 8
ret void
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20200917/04fbafd9/attachment.html>
More information about the llvm-bugs
mailing list