[llvm-bugs] [Bug 49981] New: [AArch64] missed ld/st fusion optimization
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Apr 15 18:40:28 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=49981
Bug ID: 49981
Summary: [AArch64] missed ld/st fusion optimization
Product: libraries
Version: 11.0
Hardware: PC
OS: All
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: AArch64
Assignee: unassignedbugs at nondot.org
Reporter: sebpop at gmail.com
CC: arnaud.degrandmaison at arm.com,
llvm-bugs at lists.llvm.org, smithp352 at googlemail.com,
Ties.Stuij at arm.com
On AArch64, LLVM emits more load instructions than necessary; several of them
could be merged (fused) into fewer, wider loads.
The following LLVM IR is a reduced test case from
https://github.com/ispc/ispc/issues/2052
That ispc issue also contains a larger matrix-multiply test that needs to be
fixed as well.
```
+ /home/ubuntu/ispc/bld1/bin/ispc -O2 --target=neon one.ispc -o -
--emit-llvm-text --cpu=cortex-a57
; ModuleID = 'one.ispc'
source_filename = "one.ispc"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
%v4_uniform_FVector4 = type { <4 x float> }
%v4_uniform_FMatrix = type { <16 x float> }
; Function Attrs: norecurse nounwind readonly
define %v4_uniform_FVector4
@VectorTransformVector2___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFMatrix_5D_(%v4_uniform_FVector4*
noalias nocapture readonly %VecP, %v4_uniform_FMatrix* noalias nocapture
readonly %Matrix, <4 x i32> %__mask) local_unnamed_addr #0 {
allocas:
%Matrix_load_bitcast = bitcast %v4_uniform_FMatrix* %Matrix to
%v4_uniform_FVector4*
%0 = bitcast %v4_uniform_FVector4* %VecP to i32*
%VecP_load3_offset_load55 = load i32, i32* %0, align 4
%1 = insertelement <4 x i32> undef, i32 %VecP_load3_offset_load55, i32 0
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
%FVec.i.i.sroa.0.12.vec.insert = bitcast <4 x i32> %2 to <4 x float>
%3 = bitcast %v4_uniform_FMatrix* %Matrix to <4 x float>*
%A_load_bitcast_load.unpack.i.i.i = load <4 x float>, <4 x float>* %3, align
16, !alias.scope !3, !noalias !10
%mul_S0_load_S1_load.i.i.i = fmul <4 x float>
%A_load_bitcast_load.unpack.i.i.i, %FVec.i.i.sroa.0.12.vec.insert
%VecP_load910_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 1
%VecP_load910_offset_load = load float, float* %VecP_load910_offset, align 4
%4 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 1, i32 0
%A_load_bitcast_load.unpack.i49 = load <4 x float>, <4 x float>* %4, align
16, !alias.scope !12, !noalias !15
%S1_load_broadcast.i51 = insertelement <4 x float> undef, float
%VecP_load910_offset_load, i32 0
%S1_load_broadcast8.i52 = shufflevector <4 x float> %S1_load_broadcast.i51,
<4 x float> undef, <4 x i32> zeroinitializer
%mul_S0_load_S1_load_broadcast8.i53 = fmul <4 x float>
%A_load_bitcast_load.unpack.i49, %S1_load_broadcast8.i52
%add_mul_S0_load_S1_load_broadcast8_S2_load.i54 = fadd <4 x float>
%mul_S0_load_S1_load.i.i.i, %mul_S0_load_S1_load_broadcast8.i53
%VecP_load1718_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 2
%VecP_load1718_offset_load = load float, float* %VecP_load1718_offset, align
4
%5 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 2, i32 0
%A_load_bitcast_load.unpack.i43 = load <4 x float>, <4 x float>* %5, align
16, !alias.scope !17, !noalias !20
%S1_load_broadcast.i45 = insertelement <4 x float> undef, float
%VecP_load1718_offset_load, i32 0
%S1_load_broadcast8.i46 = shufflevector <4 x float> %S1_load_broadcast.i45,
<4 x float> undef, <4 x i32> zeroinitializer
%mul_S0_load_S1_load_broadcast8.i47 = fmul <4 x float>
%A_load_bitcast_load.unpack.i43, %S1_load_broadcast8.i46
%add_mul_S0_load_S1_load_broadcast8_S2_load.i48 = fadd <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i54,
%mul_S0_load_S1_load_broadcast8.i47
%VecP_load2526_offset = getelementptr %v4_uniform_FVector4,
%v4_uniform_FVector4* %VecP, i64 0, i32 0, i64 3
%VecP_load2526_offset_load = load float, float* %VecP_load2526_offset, align
4
%6 = getelementptr %v4_uniform_FVector4, %v4_uniform_FVector4*
%Matrix_load_bitcast, i64 3, i32 0
%A_load_bitcast_load.unpack.i = load <4 x float>, <4 x float>* %6, align 16,
!alias.scope !22, !noalias !25
%S1_load_broadcast.i = insertelement <4 x float> undef, float
%VecP_load2526_offset_load, i32 0
%S1_load_broadcast8.i = shufflevector <4 x float> %S1_load_broadcast.i, <4 x
float> undef, <4 x i32> zeroinitializer
%mul_S0_load_S1_load_broadcast8.i = fmul <4 x float>
%A_load_bitcast_load.unpack.i, %S1_load_broadcast8.i
%add_mul_S0_load_S1_load_broadcast8_S2_load.i = fadd <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i48,
%mul_S0_load_S1_load_broadcast8.i
%oldret = insertvalue %v4_uniform_FVector4 undef, <4 x float>
%add_mul_S0_load_S1_load_broadcast8_S2_load.i, 0
ret %v4_uniform_FVector4 %oldret
}
attributes #0 = { norecurse nounwind readonly "target-cpu"="cortex-a57"
"target-features"="+neon" }
!llvm.ident = !{!0}
!llvm.module.flags = !{!1, !2}
!0 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git
176249bd6732a8044d457092ed932768724a6f06)"}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"PIC Level", i32 2}
!3 = !{!4, !6, !8}
!4 = distinct !{!4, !5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_: %A"}
!5 = distinct !{!5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_"}
!6 = distinct !{!6, !7, !"operator*___REFs_5B__c_unFVector4_5D_Cunf: %A"}
!7 = distinct !{!7, !"operator*___REFs_5B__c_unFVector4_5D_Cunf"}
!8 = distinct !{!8, !9, !"VectorMultiply___REFs_5B__c_unFVector4_5D_Cunf: %A"}
!9 = distinct !{!9, !"VectorMultiply___REFs_5B__c_unFVector4_5D_Cunf"}
!10 = !{!11}
!11 = distinct !{!11, !5,
!"operator*___REFs_5B__c_unFVector4_5D_REFs_5B__c_unFVector4_5D_: %B"}
!12 = !{!13}
!13 = distinct !{!13, !14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!14 = distinct !{!14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!15 = !{!16}
!16 = distinct !{!16, !14,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
!17 = !{!18}
!18 = distinct !{!18, !19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!19 = distinct !{!19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!20 = !{!21}
!21 = distinct !{!21, !19,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
!22 = !{!23}
!23 = distinct !{!23, !24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%A"}
!24 = distinct !{!24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_"}
!25 = !{!26}
!26 = distinct !{!26, !24,
!"VectorMultiplyAdd___REFs_5B__c_unFVector4_5D_CunfREFs_5B__c_unFVector4_5D_:
%C"}
```
Compiling this LLVM IR with LLVM 11 produces the following code:
```
+ /home/ubuntu/ispc/bld1/bin/ispc -O2 --target=neon one.ispc -o - --emit-asm
--cpu=cortex-a57
[...]
// %bb.0: // %allocas
mov x8, x0
ld1r { v1.4s }, [x8], #4
ldp q2, q3, [x1, #16]
ldr s0, [x8]
fmul v0.4s, v2.4s, v0.s[0]
ldr q2, [x1]
fmla v0.4s, v2.4s, v1.4s
ldr q4, [x1, #48]
ldp s1, s2, [x0, #8]
fmla v0.4s, v3.4s, v1.s[0]
fmla v0.4s, v4.4s, v2.s[0]
ret
```
The AArch64 load/store optimizer does not seem to catch this pattern:
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
```
+ llc-11 --aarch64-enable-ldst-opt -O2 one.ll -o -
[ same number of loads ]
```
The expected output (hand-optimized) is:
```
ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x1]
ldr q5, [x0]
fmul v0.4s, v1.4s, v5.s[0]
fmla v0.4s, v2.4s, v5.s[1]
fmla v0.4s, v3.4s, v5.s[2]
fmla v0.4s, v4.4s, v5.s[3]
ret
```
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210416/022f40c5/attachment-0001.html>
More information about the llvm-bugs
mailing list