[llvm-bugs] [Bug 31331] New: Support for rematerialization and folding of memory broadcasts as alternative to spilling
via llvm-bugs
llvm-bugs at lists.llvm.org
Fri Dec 9 07:46:22 PST 2016
https://llvm.org/bugs/show_bug.cgi?id=31331
Bug ID: 31331
Summary: Support for rematerialization and folding of memory
broadcasts as alternative to spilling
Product: libraries
Version: trunk
Hardware: PC
OS: Windows NT
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: zvi.rackover at intel.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
The loop in the function below uses a splat vector of float 1.0 values, which
is lowered to a vbroadcastss:
; Reproducer for the spill of a hoisted broadcast constant.
; Each loop iteration issues eight volatile <4 x float> loads relative to
; %arg[%tmp], adds a splat of 1.0 to one of the loaded vectors, and then issues
; eight volatile stores. Seven of the loaded values (%tmp4, %tmp6, %tmp8,
; %tmp11, %tmp21, %tmp22, %tmp26) are still needed after the fadd, so they are
; all live at the same time as the splat constant and the fadd result.
define void @reg_pressure_broadcast(<4 x float>* %arg) local_unnamed_addr
nounwind {
bb:
; Entry block: branch straight into the loop.
br label %bb2
bb1: ; preds = %bb2
; Loop exit.
ret void
bb2: ; preds = %bb2, %bb
; %tmp is the induction variable: 0 on entry, %tmp42 on the back edge.
%tmp = phi i64 [ 0, %bb ], [ %tmp42, %bb2 ]
; %tmp3 = &%arg[%tmp]; every other address below is an element offset from it.
%tmp3 = getelementptr inbounds <4 x float>, <4 x float>* %arg, i64 %tmp
%tmp4 = load volatile <4 x float>, <4 x float>* %tmp3
%tmp5 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 1
%tmp6 = load volatile <4 x float>, <4 x float>* %tmp5
%tmp7 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 2
%tmp8 = load volatile <4 x float>, <4 x float>* %tmp7
%tmp9 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 3
; %tmp10 is loaded but never used afterwards; the store to %tmp9 further down
; writes %tmp11 instead.
%tmp10 = load volatile <4 x float>, <4 x float>* %tmp9
%tmp13 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 4
%tmp11 = load volatile <4 x float>, <4 x float>* %tmp13
; Element offset 5 is not loaded; it is only written (through %p) below.
%tmp23 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 6
%tmp21 = load volatile <4 x float>, <4 x float>* %tmp23
%tmp24 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 7
%tmp22 = load volatile <4 x float>, <4 x float>* %tmp24
%tmp25 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 8
%tmp26 = load volatile <4 x float>, <4 x float>* %tmp25
; The constant vector here can be generated as a broadcast load
%tmp12 = fadd <4 x float> %tmp11, <float 1.0, float 1.0, float 1.0, float
1.0>
; The fadd result goes to element offset 5.
%p = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 5
store volatile <4 x float> %tmp12, <4 x float>* %p
; Write the loaded values back; this keeps them live across the fadd above.
store volatile <4 x float> %tmp4, <4 x float>* %tmp3
store volatile <4 x float> %tmp6, <4 x float>* %tmp5
store volatile <4 x float> %tmp8, <4 x float>* %tmp7
store volatile <4 x float> %tmp11, <4 x float>* %tmp9
store volatile <4 x float> %tmp21, <4 x float>* %tmp23
store volatile <4 x float> %tmp22, <4 x float>* %tmp24
store volatile <4 x float> %tmp26, <4 x float>* %tmp25
; Advance the index by 4 and leave the loop once it equals 4096.
%tmp42 = add nuw nsw i64 %tmp, 4
%tmp43 = icmp eq i64 %tmp42, 4096
br i1 %tmp43, label %bb1, label %bb2
}
LICM will hoist the vbroadcastss out of the loop, and at register-allocation
time register pressure leads to spilling the hoisted broadcast:
llc -mtriple=i686 -mattr=+avx2
# Current llc output (AT&T syntax). The broadcast constant is materialised once
# before the loop, spilled to the stack, and reloaded from the spill slot by
# the vaddps on every iteration.
pushl %esi
# Reserve 24 bytes of stack; the 16-byte spill slot lives at (%esp).
subl $24, %esp
# %edx:%eax is the 64-bit loop index (see the addl/adcl pair below).
xorl %eax, %eax
# Presumably %arg: 32 = the 24-byte frame + saved %esi + return address -- confirm.
movl 32(%esp), %ecx
vbroadcastss .LCPI0_0, %xmm0
vmovups %xmm0, (%esp) # 16-byte Spill
xorl %edx, %edx
.p2align 4, 0x90
.LBB0_1: # %bb2
# =>This Inner Loop Header: Depth=1
# %esi = index << 4, i.e. the index scaled by 16 bytes per vector element.
movl %eax, %esi
shll $4, %esi
vmovaps (%ecx,%esi), %xmm1
vmovaps 16(%ecx,%esi), %xmm2
vmovaps 32(%ecx,%esi), %xmm3
# The load from offset 48 is overwritten by the next load into %xmm4; in the
# IR the value loaded from that address (%tmp10) is never used.
vmovaps 48(%ecx,%esi), %xmm4
vmovaps 64(%ecx,%esi), %xmm4
vmovaps 96(%ecx,%esi), %xmm5
vmovaps 112(%ecx,%esi), %xmm6
vmovaps 128(%ecx,%esi), %xmm7
# %xmm1-%xmm7 all hold values that are stored below, so the broadcast constant
# is read back from its spill slot as a memory operand.
vaddps (%esp), %xmm4, %xmm0 # 16-byte Folded Reload
vmovaps %xmm0, 80(%ecx,%esi)
vmovaps %xmm1, (%ecx,%esi)
vmovaps %xmm2, 16(%ecx,%esi)
vmovaps %xmm3, 32(%ecx,%esi)
vmovaps %xmm4, 48(%ecx,%esi)
vmovaps %xmm5, 96(%ecx,%esi)
vmovaps %xmm6, 112(%ecx,%esi)
vmovaps %xmm7, 128(%ecx,%esi)
# 64-bit increment of %edx:%eax by 4.
addl $4, %eax
adcl $0, %edx
# Loop until %edx:%eax == 4096: (%eax ^ 4096) | %edx is zero only then.
movl %eax, %esi
xorl $4096, %esi # imm = 0x1000
orl %edx, %esi
jne .LBB0_1
# BB#2: # %bb1
# Release the 24-byte frame and restore %esi.
addl $24, %esp
popl %esi
retl
Instead of spilling, it would be better to fold the load directly from the
constant pool, which also removes the spill slot and its stack adjustment. Maybe like this:
# Proposed output (AT&T syntax). The constant is read straight from the
# constant pool by the vaddps, so there is no broadcast before the loop, no
# spill slot and no stack-frame adjustment.
pushl %esi
# %edx:%eax is the 64-bit loop index (see the addl/adcl pair below).
xorl %eax, %eax
# Presumably %arg: 8 = saved %esi + return address -- confirm.
movl 8(%esp), %ecx
xorl %edx, %edx
.p2align 4, 0x90
.LBB0_1: # %bb2
# =>This Inner Loop Header: Depth=1
# %esi = index << 4, i.e. the index scaled by 16 bytes per vector element.
movl %eax, %esi
shll $4, %esi
vmovaps (%ecx,%esi), %xmm1
vmovaps 16(%ecx,%esi), %xmm2
vmovaps 32(%ecx,%esi), %xmm3
# The load from offset 48 is overwritten by the next load into %xmm4; in the
# IR the value loaded from that address (%tmp10) is never used.
vmovaps 48(%ecx,%esi), %xmm4
vmovaps 64(%ecx,%esi), %xmm4
vmovaps 96(%ecx,%esi), %xmm5
vmovaps 112(%ecx,%esi), %xmm6
vmovaps 128(%ecx,%esi), %xmm7
# NOTE(review): a VEX-encoded vaddps reads a full 128-bit memory operand, so
# this form presumably needs .LCPI0_0 to be emitted as a 16-byte splat rather
# than the 4-byte scalar that vbroadcastss reads -- confirm.
vaddps .LCPI0_0, %xmm4, %xmm0 # <----- Load directly from const-pool
vmovaps %xmm0, 80(%ecx,%esi)
vmovaps %xmm1, (%ecx,%esi)
vmovaps %xmm2, 16(%ecx,%esi)
vmovaps %xmm3, 32(%ecx,%esi)
vmovaps %xmm4, 48(%ecx,%esi)
vmovaps %xmm5, 96(%ecx,%esi)
vmovaps %xmm6, 112(%ecx,%esi)
vmovaps %xmm7, 128(%ecx,%esi)
# 64-bit increment of %edx:%eax by 4.
addl $4, %eax
adcl $0, %edx
# Loop until %edx:%eax == 4096: (%eax ^ 4096) | %edx is zero only then.
movl %eax, %esi
xorl $4096, %esi # imm = 0x1000
orl %edx, %esi
jne .LBB0_1
# BB#2: # %bb1
popl %esi
retl
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20161209/a70e0801/attachment.html>
More information about the llvm-bugs
mailing list