[llvm-bugs] [Bug 31331] New: Support for rematerialization and folding of memory broadcasts as alternative to spilling

via llvm-bugs llvm-bugs at lists.llvm.org
Fri Dec 9 07:46:22 PST 2016


https://llvm.org/bugs/show_bug.cgi?id=31331

            Bug ID: 31331
           Summary: Support for rematerialization and folding of memory
                    broadcasts as alternative to spilling
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: zvi.rackover at intel.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

The loop in the function below uses a splat vector of float 1.0 values, which
is lowered to a vbroadcastss:

define void @reg_pressure_broadcast(<4 x float>* %arg) local_unnamed_addr nounwind {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp42, %bb2 ]
  %tmp3 = getelementptr inbounds <4 x float>, <4 x float>* %arg, i64 %tmp
  %tmp4 = load volatile <4 x float>, <4 x float>* %tmp3
  %tmp5 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 1
  %tmp6 = load volatile <4 x float>, <4 x float>* %tmp5
  %tmp7 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 2
  %tmp8 = load volatile <4 x float>, <4 x float>* %tmp7
  %tmp9 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 3
  %tmp10 = load volatile <4 x float>, <4 x float>* %tmp9
  %tmp13 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 4
  %tmp11 = load volatile <4 x float>, <4 x float>* %tmp13
  %tmp23 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 6
  %tmp21 = load volatile <4 x float>, <4 x float>* %tmp23
  %tmp24 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 7
  %tmp22 = load volatile <4 x float>, <4 x float>* %tmp24
  %tmp25 = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 8
  %tmp26 = load volatile <4 x float>, <4 x float>* %tmp25
  ; The constant vector here can be generated as a broadcast load
  %tmp12 = fadd <4 x float> %tmp11, <float 1.0, float 1.0, float 1.0, float 1.0>
  %p = getelementptr inbounds <4 x float>, <4 x float>* %tmp3, i64 5
  store volatile <4 x float> %tmp12, <4 x float>* %p
  store volatile <4 x float> %tmp4, <4 x float>* %tmp3
  store volatile <4 x float> %tmp6, <4 x float>* %tmp5
  store volatile <4 x float> %tmp8, <4 x float>* %tmp7
  store volatile <4 x float> %tmp11, <4 x float>* %tmp9
  store volatile <4 x float> %tmp21, <4 x float>* %tmp23
  store volatile <4 x float> %tmp22, <4 x float>* %tmp24
  store volatile <4 x float> %tmp26, <4 x float>* %tmp25
  %tmp42 = add nuw nsw i64 %tmp, 4
  %tmp43 = icmp eq i64 %tmp42, 4096
  br i1 %tmp43, label %bb1, label %bb2
}

LICM hoists the vbroadcastss out of the loop, and at register-allocation time
register pressure leads to the hoisted broadcast being spilled:

llc -mtriple=i686 -mattr=+avx2

        pushl   %esi                                             
        subl    $24, %esp                                        
        xorl    %eax, %eax                                       
        movl    32(%esp), %ecx                                   
        vbroadcastss    .LCPI0_0, %xmm0                          
        vmovups %xmm0, (%esp)           # 16-byte Spill          
        xorl    %edx, %edx                                       
        .p2align        4, 0x90                                  
.LBB0_1:                                # %bb2                   
                                        # =>This Inner Loop Header: Depth=1
        movl    %eax, %esi                                                 
        shll    $4, %esi                                                   
        vmovaps (%ecx,%esi), %xmm1                                         
        vmovaps 16(%ecx,%esi), %xmm2                                       
        vmovaps 32(%ecx,%esi), %xmm3
        vmovaps 48(%ecx,%esi), %xmm4
        vmovaps 64(%ecx,%esi), %xmm4
        vmovaps 96(%ecx,%esi), %xmm5
        vmovaps 112(%ecx,%esi), %xmm6
        vmovaps 128(%ecx,%esi), %xmm7
        vaddps  (%esp), %xmm4, %xmm0    # 16-byte Folded Reload
        vmovaps %xmm0, 80(%ecx,%esi)
        vmovaps %xmm1, (%ecx,%esi)
        vmovaps %xmm2, 16(%ecx,%esi)
        vmovaps %xmm3, 32(%ecx,%esi)
        vmovaps %xmm4, 48(%ecx,%esi)
        vmovaps %xmm5, 96(%ecx,%esi)
        vmovaps %xmm6, 112(%ecx,%esi)
        vmovaps %xmm7, 128(%ecx,%esi)
        addl    $4, %eax
        adcl    $0, %edx
        movl    %eax, %esi
        xorl    $4096, %esi             # imm = 0x1000
        orl     %edx, %esi
        jne     .LBB0_1
# BB#2:                                 # %bb1
        addl    $24, %esp
        popl    %esi
        retl

Instead of spilling, it would be better to fold the load directly from the
constant pool. Maybe like this:

        pushl   %esi                                             
        xorl    %eax, %eax                                       
        movl    8(%esp), %ecx                                    
        xorl    %edx, %edx                                       
        .p2align        4, 0x90                                  
.LBB0_1:                                # %bb2                   
                                        # =>This Inner Loop Header: Depth=1
        movl    %eax, %esi                                                 
        shll    $4, %esi                                                   
        vmovaps (%ecx,%esi), %xmm1                                         
        vmovaps 16(%ecx,%esi), %xmm2
        vmovaps 32(%ecx,%esi), %xmm3
        vmovaps 48(%ecx,%esi), %xmm4
        vmovaps 64(%ecx,%esi), %xmm4
        vmovaps 96(%ecx,%esi), %xmm5
        vmovaps 112(%ecx,%esi), %xmm6
        vmovaps 128(%ecx,%esi), %xmm7
        vaddps  .LCPI0_0, %xmm4, %xmm0  # <-- load directly from the constant pool
        vmovaps %xmm0, 80(%ecx,%esi)
        vmovaps %xmm1, (%ecx,%esi)
        vmovaps %xmm2, 16(%ecx,%esi)
        vmovaps %xmm3, 32(%ecx,%esi)
        vmovaps %xmm4, 48(%ecx,%esi)
        vmovaps %xmm5, 96(%ecx,%esi)
        vmovaps %xmm6, 112(%ecx,%esi)
        vmovaps %xmm7, 128(%ecx,%esi)
        addl    $4, %eax
        adcl    $0, %edx
        movl    %eax, %esi
        xorl    $4096, %esi             # imm = 0x1000
        orl     %edx, %esi
        jne     .LBB0_1
# BB#2:                                 # %bb1
        popl    %esi
        retl
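
One possible direction (a rough sketch, not a patch): since the vbroadcastss
is an invariant load from the constant pool, the X86 backend could mark it as
trivially rematerializable. The register allocator could then re-issue the
broadcast, or fold the constant-pool load into its user as in the code above,
instead of spilling and reloading it. The sketch below paraphrases what such a
change might look like in X86InstrInfo::isReallyTriviallyReMaterializable; the
opcode list, helper names and surrounding code are approximate and may not
match the current tree.

    // Sketch only, not a diff against X86InstrInfo.cpp: treat broadcast loads
    // the same way plain vector loads from the constant pool are treated, so
    // the allocator may rematerialize them rather than spill their result.
    bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                         AliasAnalysis *AA) const {
      switch (MI.getOpcode()) {
      // ... existing rematerializable load opcodes (MOVAPSrm, VMOVAPSrm, ...)
      case X86::VBROADCASTSSrm:
      case X86::VBROADCASTSSYrm:
      case X86::VBROADCASTSDYrm:
        // A broadcast whose source is a dereferenceable invariant load (e.g. a
        // constant-pool entry) can safely be re-executed at any program point,
        // so prefer rematerializing it over spilling/reloading its result.
        // (Helper name is approximate; the existing invariant-load check on
        // MachineInstr is what is meant here.)
        return MI.isDereferenceableInvariantLoad(AA);
      default:
        return false;
      }
    }

With the broadcast known to be rematerializable, the spiller's existing logic
for folding a rematerializable def into a single use should then be able to
produce the vaddps with a constant-pool memory operand shown above, rather
than the 16-byte spill and folded reload from the stack.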
