[llvm] 0f26125 - [AArch64] Use first op of FADDPv* instead of implicit def.

Thu Mar 3 05:32:27 PST 2022

Author: Florian Hahn
Date: 2022-03-03T13:32:09Z
New Revision: 0f261256e01f55aac04cf271f238e36fda8013d5

URL: https://github.com/llvm/llvm-project/commit/0f261256e01f55aac04cf271f238e36fda8013d5
DIFF: https://github.com/llvm/llvm-project/commit/0f261256e01f55aac04cf271f238e36fda8013d5.diff

LOG: [AArch64] Use first op of FADDPv* instead of implicit def.

This patch updates the FADDPv* patterns that only use the lower half of
the result register. For those patterns, the second operand does not
matter because the result lanes computed from it never reach the final value.

Instead of introducing new implicit defs for those operands, just use
the first operand. The problem with using new implicit defs is that
register allocation can introduce unnecessary dependencies by using a
different register than the first operand.

For motivating cases, see the changes in the fadd_reduction_*_in_loop
cases. Without this change, the first faddp in the loop has an
unnecessary additional dependency through v0, which is also used for
a cross-iteration reduction.

This can noticeably impact performance. For slightly bigger loops,
this change can improve performance by 15%.

Reviewed By: sdesmalen, t.p.northover

Differential Revision: https://reviews.llvm.org/D120706

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/vecreduce-fadd.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8e1f61925794f..1152f8b20a7b4 100644

--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5345,19 +5345,22 @@ defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
 defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
 defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
 
+// Only the lower half of the result of the inner FADDP is used in the patterns
+// below, so the second operand does not matter. Re-use the first input
+// operand, so no additional dependencies need to be introduced.
 let Predicates = [HasFullFP16] in {
 def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
             (FADDPv2i16p
               (EXTRACT_SUBREG
-                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn),
                dsub))>;
 def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
-          (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+          (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>;
 }
 def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
           (FADDPv2i32p
             (EXTRACT_SUBREG
-              (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+              (FADDPv4f32 V128:$Rn, V128:$Rn),
              dsub))>;
 def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
           (FADDPv2i32p V64:$Rn)>;

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index c147b8d448c21..e83105f0bfdec 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -45,8 +45,8 @@ define half @add_HalfH(<4 x half> %bin.rdx)  {
 define half @add_H(<8 x half> %bin.rdx)  {
 ; FULLFP16-LABEL: add_H:
 ; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@@ -115,8 +115,8 @@ define half @add_2H(<16 x half> %bin.rdx)  {
 ; FULLFP16-LABEL: add_2H:
 ; FULLFP16:       // %bb.0:
 ; FULLFP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@@ -248,7 +248,7 @@ define float @fadd_reduction_v4f32_in_loop(float* %ptr.start) {
 ; CHECK-NEXT:    ldr q1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp w8, #112
-; CHECK-NEXT:    faddp v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    faddp s1, v1.2s
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    b.ne .LBB9_1
@@ -286,7 +286,7 @@ define half @fadd_reduction_v4f16_in_loop(half* %ptr.start) {
 ; FULLFP16-NEXT:    ldr d1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v0.4h
+; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB10_1
@@ -357,8 +357,8 @@ define half @fadd_reduction_v8f16_in_loop(half* %ptr.start) {
 ; FULLFP16-NEXT:    ldr q1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
+; FULLFP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
+; FULLFP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB11_1