[PATCH] D25020: [ARM] Fix 26% performance regression on Cortex-A9 caused by not using VMLA/VMLS

Wed Sep 28 07:32:00 PDT 2016

eastig created this revision.
eastig added reviewers: jmolloy, rengolin, t.p.northover.
eastig added a subscriber: llvm-commits.
Herald added subscribers: samparker, rengolin, aemerson.

We have a 26% performance regression on Cortex-A9. We found that it is caused by VMLA/VMLS instructions not being used.
ARMDAGToDAGISel::hasNoVMLxHazardUse prevents generation of VMLx instructions for Cortex-A8 and Cortex-A9. In addition, the MLxExpansion pass expands VMLx instructions on Cortex-A8 and Cortex-A9. This code is based on the following note:

http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/BCGDCECC.html#ftn.CEGHAGEA

```
The VMLA.F and VMLS.F type instructions have additional restrictions that determine when they can be issued:
If a VMLA.F is followed by a VMLA.F with no RAW hazard, the second VMLA.F issues with no stalls.
If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the VADD.F or VMUL.F stalls 4 cycles before issue. The 4 cycle stall preserves the in-order retirement of the instructions.
A VMLA.F followed by any NEON floating-point instruction with RAW hazard stalls for 8 cycles.
```

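3.7.0/3.7.1 have a bug which causes Subtarget->isCortexA9() to return false even when the specified CPU is Cortex-A9. As a result, the hazard-avoidance code described above had no effect and VMLx instructions were generated.
In 3.8.0 the bug is fixed, so VMLx instructions are no longer generated: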
```
$ cat fml.ll 
define double @test(double %a, double %b, double %c, double %d, double %e, double %f) #0 {
  %1 = fmul double %a, %c
  %2 = fmul double %b, %d
  %3 = fsub double %1, %2

  %4 = fmul double %a, %d
  %5 = fmul double %b, %c
  %6 = fadd double %5, %4

  %7 = fsub double %e, %3
  %8 = fsub double %f, %6
  %9 = fadd double %3, %8
  %10 = fadd double %6, %7
  %11 = fmul double %9, %10

  ret double %11
}

attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+neon,+vfp3,-crypto,-d16,-fp-armv8,-fp-only-sp,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" }

$ 3.7.1/bin/llc -mtriple=arm-eabi -mcpu=cortex-a9 fml.ll -o -
        .text
        .syntax unified
        .eabi_attribute 67, "2.09"      @ Tag_conformance
        .cpu    cortex-a9
        .eabi_attribute 6, 10   @ Tag_CPU_arch
        .eabi_attribute 7, 65   @ Tag_CPU_arch_profile
        .eabi_attribute 8, 1    @ Tag_ARM_ISA_use
        .eabi_attribute 9, 2    @ Tag_THUMB_ISA_use
        .fpu    neon-fp16
        .eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use
        .eabi_attribute 20, 1   @ Tag_ABI_FP_denormal
        .eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions
        .eabi_attribute 23, 3   @ Tag_ABI_FP_number_model
        .eabi_attribute 34, 0   @ Tag_CPU_unaligned_access
        .eabi_attribute 24, 1   @ Tag_ABI_align_needed
        .eabi_attribute 25, 1   @ Tag_ABI_align_preserved
        .eabi_attribute 36, 1   @ Tag_FP_HP_extension
        .eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format
        .eabi_attribute 42, 1   @ Tag_MPextension_use
        .eabi_attribute 14, 0   @ Tag_ABI_PCS_R9_use
        .eabi_attribute 68, 1   @ Tag_Virtualization_use
        .file   "fml.ll"
        .globl  test
        .align  2
        .type   test,%function
test:                                   @ @test
        .fnstart
@ BB#0:
        vldr    d16, [sp]
        vldr    d18, [sp, #8]
        vmov    d17, r0, r1
        vmul.f64        d19, d17, d16
        vmul.f64        d17, d17, d18
        vmov    d20, r2, r3
        vmls.f64        d19, d20, d18
        vmla.f64        d17, d20, d16
        vldr    d16, [sp, #16]
        vldr    d18, [sp, #24]
        vsub.f64        d16, d16, d19
        vsub.f64        d18, d18, d17
        vadd.f64        d16, d17, d16
        vadd.f64        d17, d19, d18
        vmul.f64        d16, d17, d16
        vmov    r0, r1, d16
        bx      lr
.Lfunc_end0:
        .size   test, .Lfunc_end0-test
        .cantunwind
        .fnend

        .section        ".note.GNU-stack","",%progbits

$ 3.8.0/bin/llc -mtriple=arm-eabi -mcpu=cortex-a9 fml.ll -o -
        .text
        .syntax unified
        .eabi_attribute 67, "2.09"      @ Tag_conformance
        .cpu    cortex-a9
        .eabi_attribute 6, 10   @ Tag_CPU_arch
        .eabi_attribute 7, 65   @ Tag_CPU_arch_profile
        .eabi_attribute 8, 1    @ Tag_ARM_ISA_use
        .eabi_attribute 9, 2    @ Tag_THUMB_ISA_use
        .fpu    neon-fp16
        .eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use
        .eabi_attribute 20, 1   @ Tag_ABI_FP_denormal
        .eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions
        .eabi_attribute 23, 3   @ Tag_ABI_FP_number_model
        .eabi_attribute 34, 1   @ Tag_CPU_unaligned_access
        .eabi_attribute 24, 1   @ Tag_ABI_align_needed
        .eabi_attribute 25, 1   @ Tag_ABI_align_preserved
        .eabi_attribute 36, 1   @ Tag_FP_HP_extension
        .eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format
        .eabi_attribute 42, 1   @ Tag_MPextension_use
        .eabi_attribute 14, 0   @ Tag_ABI_PCS_R9_use
        .eabi_attribute 68, 1   @ Tag_Virtualization_use
        .file   "fml.ll"
        .globl  test
        .align  2
        .type   test,%function
test:                                   @ @test
        .fnstart
@ BB#0:
        vldmia  sp, {d16, d17}
        vmov    d18, r2, r3
        vmov    d19, r0, r1
        vmul.f64        d20, d18, d17
        vmul.f64        d21, d19, d16
        vmul.f64        d17, d19, d17
        vmul.f64        d16, d18, d16
        vsub.f64        d18, d21, d20
        vldr    d19, [sp, #24]
        vadd.f64        d16, d16, d17
        vldr    d17, [sp, #16]
        vsub.f64        d17, d17, d18
        vsub.f64        d19, d19, d16
        vadd.f64        d16, d16, d17
        vadd.f64        d17, d18, d19
        vmul.f64        d16, d17, d16
        vmov    r0, r1, d16
        bx      lr
.Lfunc_end0:
        .size   test, .Lfunc_end0-test
        .cantunwind
        .fnend

        .section        ".note.GNU-stack","",%progbits
        .eabi_attribute 30, 1   @ Tag_ABI_optimization_goals

```

VMLx instructions can be faster on Cortex-A9 because of accumulator forwarding:

http://infocenter.arm.com/help/topic/com.arm.doc.ddi0409i/BCGDCIBA.html#ftn.id3445094

```
If a multiply-accumulate follows a multiply or another multiply-accumulate, and depends on the result of that first instruction, then if the dependency between both instructions are of the same type and size, the processor uses a special multiplier accumulator forwarding. This special forwarding means the multiply instructions can issue back-to-back because the result of the first instruction in cycle 5 is forwarded to the accumulator of the second instruction in cycle 4. If the size and type of the instructions do not match, then Dd or Qd is required in cycle 3. This applies to combinations of the multiply-accumulate instructions VMLA, VMLS, VQDMLA, and VQDMLS, and the multiply instructions VMUL and VQDMUL.
```

This patch fixes this issue.

https://reviews.llvm.org/D25020

Files:
  lib/Target/ARM/ARMISelDAGToDAG.cpp
  lib/Target/ARM/MLxExpansionPass.cpp
  test/CodeGen/ARM/fmacs.ll
  test/CodeGen/ARM/fml.ll

-------------- next part --------------
A non-text attachment was scrubbed...
Name: D25020.72817.patch
Type: text/x-patch
Size: 8548 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160928/afd2b0c9/attachment.bin>