<html>
<head>
<base href="https://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - Support for rematerialization and folding of memory broadcasts as alternative to spilling"
href="https://llvm.org/bugs/show_bug.cgi?id=31331">31331</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Support for rematerialization and folding of memory broadcasts as alternative to spilling
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>zvi.rackover@intel.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<p>
<div>
<pre>The loop in the function below uses a splat vector of float 1.0 values,
which is lowered to a vbroadcastss:
define void @reg_pressure_broadcast(<4 x float>* %arg) local_unnamed_addr
nounwind {
bb:
br label %bb2
bb1: ; preds = %bb2
ret void
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp42, %bb2 ]
%tmp3 = getelementptr inbounds <4 x float>, <4 x float>* %arg, i64 %tmp
%tmp4 = load volatile <4 x float>, <4 x float>* %tmp3
%tmp5 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 1
%tmp6 = load volatile <4 x float>, <4 x float>* %tmp5
%tmp7 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 2
%tmp8 = load volatile <4 x float>, <4 x float>* %tmp7
%tmp9 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 3
%tmp10 = load volatile <4 x float>, <4 x float>* %tmp9
%tmp13 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 4
%tmp11 = load volatile <4 x float>, <4 x float>* %tmp13
%tmp23 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 6
%tmp21 = load volatile <4 x float>, <4 x float>* %tmp23
%tmp24 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 7
%tmp22 = load volatile <4 x float>, <4 x float>* %tmp24
%tmp25 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 8
%tmp26 = load volatile <4 x float>, <4 x float>* %tmp25
; The constant vector here can be generated as a broadcast load
%tmp12 = fadd <4 x float> %tmp11, <float 1.0, float 1.0, float 1.0, float
1.0>
%p = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 5
store volatile <4 x float> %tmp12, <4 x float>* %p
store volatile <4 x float> %tmp4, <4 x float>* %tmp3
store volatile <4 x float> %tmp6, <4 x float>* %tmp5
store volatile <4 x float> %tmp8, <4 x float>* %tmp7
store volatile <4 x float> %tmp11, <4 x float>* %tmp9
store volatile <4 x float> %tmp21, <4 x float>* %tmp23
store volatile <4 x float> %tmp22, <4 x float>* %tmp24
store volatile <4 x float> %tmp26, <4 x float>* %tmp25
%tmp42 = add nuw nsw i64 %tmp, 4
%tmp43 = icmp eq i64 %tmp42, 4096
br i1 %tmp43, label %bb1, label %bb2
}
LICM hoists the vbroadcastss out of the loop, and at register-allocation
time register pressure causes the hoisted broadcast to be spilled:
llc -mtriple=i686 -mattr=+avx2
pushl %esi
subl $24, %esp
xorl %eax, %eax
movl 32(%esp), %ecx
vbroadcastss .LCPI0_0, %xmm0
vmovups %xmm0, (%esp) # 16-byte Spill
xorl %edx, %edx
.p2align 4, 0x90
.LBB0_1: # %bb2
# =>This Inner Loop Header: Depth=1
movl %eax, %esi
shll $4, %esi
vmovaps (%ecx,%esi), %xmm1
vmovaps 16(%ecx,%esi), %xmm2
vmovaps 32(%ecx,%esi), %xmm3
vmovaps 48(%ecx,%esi), %xmm4
vmovaps 64(%ecx,%esi), %xmm4
vmovaps 96(%ecx,%esi), %xmm5
vmovaps 112(%ecx,%esi), %xmm6
vmovaps 128(%ecx,%esi), %xmm7
vaddps (%esp), %xmm4, %xmm0 # 16-byte Folded Reload
vmovaps %xmm0, 80(%ecx,%esi)
vmovaps %xmm1, (%ecx,%esi)
vmovaps %xmm2, 16(%ecx,%esi)
vmovaps %xmm3, 32(%ecx,%esi)
vmovaps %xmm4, 48(%ecx,%esi)
vmovaps %xmm5, 96(%ecx,%esi)
vmovaps %xmm6, 112(%ecx,%esi)
vmovaps %xmm7, 128(%ecx,%esi)
addl $4, %eax
adcl $0, %edx
movl %eax, %esi
xorl $4096, %esi # imm = 0x1000
orl %edx, %esi
jne .LBB0_1
# BB#2: # %bb1
addl $24, %esp
popl %esi
retl
Instead of spilling, it would be better to fold the load directly from the
constant pool, for example:
pushl %esi
xorl %eax, %eax
movl 8(%esp), %ecx
xorl %edx, %edx
.p2align 4, 0x90
.LBB0_1: # %bb2
# =>This Inner Loop Header: Depth=1
movl %eax, %esi
shll $4, %esi
vmovaps (%ecx,%esi), %xmm1
vmovaps 16(%ecx,%esi), %xmm2
vmovaps 32(%ecx,%esi), %xmm3
vmovaps 48(%ecx,%esi), %xmm4
vmovaps 64(%ecx,%esi), %xmm4
vmovaps 96(%ecx,%esi), %xmm5
vmovaps 112(%ecx,%esi), %xmm6
vmovaps 128(%ecx,%esi), %xmm7
vaddps .LCPI0_0, %xmm4, %xmm0 <-----Load directly from const-pool
vmovaps %xmm0, 80(%ecx,%esi)
vmovaps %xmm1, (%ecx,%esi)
vmovaps %xmm2, 16(%ecx,%esi)
vmovaps %xmm3, 32(%ecx,%esi)
vmovaps %xmm4, 48(%ecx,%esi)
vmovaps %xmm5, 96(%ecx,%esi)
vmovaps %xmm6, 112(%ecx,%esi)
vmovaps %xmm7, 128(%ecx,%esi)
addl $4, %eax
adcl $0, %edx
movl %eax, %esi
xorl $4096, %esi # imm = 0x1000
orl %edx, %esi
jne .LBB0_1
# BB#2: # %bb1
popl %esi
retl</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>