<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - [AArch64] missed ld/st fusion optimization"
   href="https://bugs.llvm.org/show_bug.cgi?id=49981">49981</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[AArch64] missed ld/st fusion optimization
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>11.0
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: AArch64
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>sebpop@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>arnaud.degrandmaison@arm.com, llvm-bugs@lists.llvm.org, smithp352@googlemail.com, Ties.Stuij@arm.com
          </td>
        </tr></table>
      <p>
        <div>
        <pre>On Arm64 LLVM produces too many loads that could be fused.

The following LLVM IR is a reduced test case from
<a href="https://github.com/ispc/ispc/issues/2052">https://github.com/ispc/ispc/issues/2052</a>
That ISPC issue also contains a larger matrix-multiply test that needs to be
fixed as well.

```
+ /home/ubuntu/ispc/bld1/bin/ispc -O2 --target=neon one.ispc -o -
--emit-llvm-text --cpu=cortex-a57

; ModuleID = 'one.ispc'
source_filename = "one.ispc"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

%v4_uniform_FVector4 = type { <4 x float> }
%v4_uniform_FMatrix = type { <16 x float> }

; Function Attrs: norecurse nounwind readonly
define %v4_uniform_FVector4
@VectorTransformVector2___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFMatrix_5D_(%v4_uniform_FVector4*
noalias nocapture readonly %VecP, %v4_uniform_FMatrix* noalias nocapture
readonly %Matrix, <4 x i32> %__mask) local_unnamed_addr #0 {
allocas:
  %Matrix_load_bitcast = bitcast %v4_uniform_FMatrix* %Matrix to
%v4_uniform_FVector4*
  %0 = bitcast %v4_uniform_FVector4* %VecP to i32*
  %VecP_load3_offset_load55 = load i32, i32* %0, align 4
  %1 = insertelement <4 x i32> undef, i32 %VecP_load3_offset_load55, i32 0
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  %FVec.i.i.sroa.0.12.vec.insert = bitcast <4 x i32> %2 to <4 x float>
  %3 = bitcast %v4_uniform_FMatrix* %Matrix to <4 x float>*
  %A_load_bitcast_load.unpack.i.i.i = load <4 x float>, <4 x float>* %3, align
16, !alias.scope !3, !noalias !10
  %mul_S0_load_S1_load.i.i.i = fmul <4 x float>
%A_load_bitcast_load.unpack.i.i.i, %FVec.i.i.sroa.0.12.vec.insert
  %VecP_load910_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 1
  %VecP_load910_offset_load = load float, float* %VecP_load910_offset, align 4
  %4 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 1, i32 0
  %A_load_bitcast_load.unpack.i49 = load <4 x float>, <4 x float>* %4, align
16, !alias.scope !12, !noalias !15
  %S1_load_broadcast.i51 = insertelement <4 x float> undef, float
%VecP_load910_offset_load, i32 0
  %S1_load_broadcast8.i52 = shufflevector <4 x float> %S1_load_broadcast.i51,
<4 x float> undef, <4 x i32> zeroinitializer
  %mul_S0_load_S1_load_broadcast8.i53 = fmul <4 x float>
%A_load_bitcast_load.unpack.i49, %S1_load_broadcast8.i52
  %add_mul_S0_load_S1_load_broadcast8_S2_load.i54 = fadd <4 x float>
%mul_S0_load_S1_load.i.i.i, %mul_S0_load_S1_load_broadcast8.i53
  %VecP_load1718_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 2
  %VecP_load1718_offset_load = load float, float* %VecP_load1718_offset, align
4
  %5 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 2, i32 0
  %A_load_bitcast_load.unpack.i43 = load <4 x float>, <4 x float>* %5, align
16, !alias.scope !17, !noalias !20
  %S1_load_broadcast.i45 = insertelement <4 x float> undef, float
%VecP_load1718_offset_load, i32 0
  %S1_load_broadcast8.i46 = shufflevector <4 x float> %S1_load_broadcast.i45,
<4 x float> undef, <4 x i32> zeroinitializer
  %mul_S0_load_S1_load_broadcast8.i47 = fmul <4 x float>
%A_load_bitcast_load.unpack.i43, %S1_load_broadcast8.i46
  %add_mul_S0_load_S1_load_broadcast8_S2_load.i48 = fadd <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i54,
%mul_S0_load_S1_load_broadcast8.i47
  %VecP_load2526_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 3
  %VecP_load2526_offset_load = load float, float* %VecP_load2526_offset, align
4
  %6 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 3, i32 0
  %A_load_bitcast_load.unpack.i = load <4 x float>, <4 x float>* %6, align 16,
!alias.scope !22, !noalias !25
  %S1_load_broadcast.i = insertelement <4 x float> undef, float
%VecP_load2526_offset_load, i32 0
  %S1_load_broadcast8.i = shufflevector <4 x float> %S1_load_broadcast.i, <4 x
float> undef, <4 x i32> zeroinitializer
  %mul_S0_load_S1_load_broadcast8.i = fmul <4 x float>
%A_load_bitcast_load.unpack.i, %S1_load_broadcast8.i
  %add_mul_S0_load_S1_load_broadcast8_S2_load.i = fadd <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i48,
%mul_S0_load_S1_load_broadcast8.i
  %oldret = insertvalue %v4_uniform_FVector4 undef, <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i, 0
  ret %v4_uniform_FVector4 %oldret
}

attributes #0 = { norecurse nounwind readonly "target-cpu"="cortex-a57"
"target-features"="+neon" }

!llvm.ident = !{!0}
!llvm.module.flags = !{!1, !2}

!0 = !{!"clang version 11.0.0 (<a href="https://github.com/llvm/llvm-project.git">https://github.com/llvm/llvm-project.git</a>
176249bd6732a8044d457092ed932768724a6f06)"}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"PIC Level", i32 2}
!3 = !{!4, !6, !8}
!4 = distinct !{!4, !5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_: %A"}
!5 = distinct !{!5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_"}
!6 = distinct !{!6, !7, !"operator*___REFs_5B__c_unFVector4_5D_Cunf: %A"}
!7 = distinct !{!7, !"operator*___REFs_5B__c_unFVector4_5D_Cunf"}
!8 = distinct !{!8, !9, !"VectorMultiply___REFs_5B__c_unFVector4_5D_Cunf: %A"}
!9 = distinct !{!9, !"VectorMultiply___REFs_5B__c_unFVector4_5D_Cunf"}
!10 = !{!11}
!11 = distinct !{!11, !5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_: %B"}
!12 = !{!13}
!13 = distinct !{!13, !14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!14 = distinct !{!14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!15 = !{!16}
!16 = distinct !{!16, !14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
!17 = !{!18}
!18 = distinct !{!18, !19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!19 = distinct !{!19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!20 = !{!21}
!21 = distinct !{!21, !19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
!22 = !{!23}
!23 = distinct !{!23, !24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!24 = distinct !{!24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!25 = !{!26}
!26 = distinct !{!26, !24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
```

Compiling this LLVM IR with LLVM 11 produces the following code:

```
+ /home/ubuntu/ispc/bld1/bin/ispc -O2 --target=neon one.ispc -o - --emit-asm
--cpu=cortex-a57
[...]
// %bb.0:                               // %allocas
        mov     x8, x0
        ld1r    { v1.4s }, [x8], #4
        ldp     q2, q3, [x1, #16]
        ldr     s0, [x8]
        fmul    v0.4s, v2.4s, v0.s[0]
        ldr     q2, [x1]
        fmla    v0.4s, v2.4s, v1.4s
        ldr     q4, [x1, #48]
        ldp     s1, s2, [x0, #8]
        fmla    v0.4s, v3.4s, v1.s[0]
        fmla    v0.4s, v4.4s, v2.s[0]
        ret
```
The AArch64 load/store optimizer (source linked below) does not seem to catch
this pattern, even when it is enabled explicitly:
<a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp">https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp</a>
```
+ llc-11 --aarch64-enable-ldst-opt -O2 one.ll -o -
[ same number of loads ]
```

The expected (hand-optimized) output is:
```
        ld1     { v1.4s, v2.4s, v3.4s, v4.4s }, [x1]
        ldr     q5, [x0]
        fmul    v0.4s, v1.4s, v5.s[0]
        fmla    v0.4s, v2.4s, v5.s[1]
        fmla    v0.4s, v3.4s, v5.s[2]
        fmla    v0.4s, v4.4s, v5.s[3]
        ret
```</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>