[llvm] 9aa3948 - [AArch64] Prefer to fold dup into fmul/fma as opposed to ld1r

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 7 13:24:31 PST 2023


Author: David Green
Date: 2023-03-07T21:24:16Z
New Revision: 9aa39481d9eb718e872993791547053a3c1f16d5

URL: https://github.com/llvm/llvm-project/commit/9aa39481d9eb718e872993791547053a3c1f16d5
DIFF: https://github.com/llvm/llvm-project/commit/9aa39481d9eb718e872993791547053a3c1f16d5.diff

LOG: [AArch64] Prefer to fold dup into fmul/fma as opposed to ld1r

There is a fold that creates LD1DUPpost from a dup(load) whose address can be
post-incremented. If the dup is instead used by a "by element" operation such
as fmul or fma, it can be slightly better to fold the dup into that operation,
which produces slightly faster code.

  ld1r { v1.4s }, [x0], #4
  fmul v0.4s, v1.4s, v0.4s
vs
  ldr s1, [x0], #4
  fmul v0.4s, v0.4s, v1.s[0]

This could also be done for integer operations such as smull/umull, so long as
the load/dup gets correctly combined into the mul operation. Currently this
only handles floating point types.
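
For reference, the IR pattern involved looks roughly like the following
(a sketch based on the fmul_v4f32 test below; the value names are
illustrative):

  define ptr @fmul_v4f32(ptr %p, ptr %ps, <4 x float> %t) {
    %l = load float, ptr %p                 ; scalar load, post-incremented below
    %q = getelementptr float, ptr %p, i64 1 ; the increment that makes this post-inc
    %v = insertelement <4 x float> poison, float %l, i64 0
    %dup = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> zeroinitializer
    %m = fmul <4 x float> %dup, %t          ; the dup's single use is an fmul
    store <4 x float> %m, ptr %ps
    ret ptr %q
  }

With the new one-use check, the dup here is left alone so that selection can
pick the by-element fmul and a plain post-indexed ldr rather than ld1r.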

Differential Revision: https://reviews.llvm.org/D145184

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/ld1postmul.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index aa1049cf09a08..f3efb62d0da94 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19348,6 +19348,15 @@ static SDValue performPostLD1Combine(SDNode *N,
       return SDValue();
   }
 
+  // If there is one use and it can splat the value, prefer that operation.
+  // TODO: This could be expanded to more operations if they reliably use the
+  // index variants.
+  if (N->hasOneUse()) {
+    unsigned UseOpc = N->use_begin()->getOpcode();
+    if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
+      return SDValue();
+  }
+
   SDValue Addr = LD->getOperand(1);
   SDValue Vector = N->getOperand(0);
   // Search for a use of the address operand that is an increment.

diff --git a/llvm/test/CodeGen/AArch64/ld1postmul.ll b/llvm/test/CodeGen/AArch64/ld1postmul.ll
index 5c219ec01688b..44f6ee093c5ce 100644
--- a/llvm/test/CodeGen/AArch64/ld1postmul.ll
+++ b/llvm/test/CodeGen/AArch64/ld1postmul.ll
@@ -63,8 +63,8 @@ define ptr @fmul_v4f16(ptr %p, ptr %ps, <4 x half> %t) {
 ;
 ; CHECK-FP16-LABEL: fmul_v4f16:
 ; CHECK-FP16:       // %bb.0:
-; CHECK-FP16-NEXT:    ld1r { v1.4h }, [x0], #2
-; CHECK-FP16-NEXT:    fmul v0.4h, v1.4h, v0.4h
+; CHECK-FP16-NEXT:    ldr h1, [x0], #2
+; CHECK-FP16-NEXT:    fmul v0.4h, v0.4h, v1.h[0]
 ; CHECK-FP16-NEXT:    str d0, [x1]
 ; CHECK-FP16-NEXT:    ret
   %l = load half, ptr %p
@@ -93,8 +93,8 @@ define ptr @fmla_v4f16(ptr %p, ptr %ps, <4 x half> %t, <4 x half> %u) {
 ;
 ; CHECK-FP16-LABEL: fmla_v4f16:
 ; CHECK-FP16:       // %bb.0:
-; CHECK-FP16-NEXT:    ld1r { v2.4h }, [x0], #2
-; CHECK-FP16-NEXT:    fmla v1.4h, v0.4h, v2.4h
+; CHECK-FP16-NEXT:    ldr h2, [x0], #2
+; CHECK-FP16-NEXT:    fmla v1.4h, v0.4h, v2.h[0]
 ; CHECK-FP16-NEXT:    str d1, [x1]
 ; CHECK-FP16-NEXT:    ret
   %l = load half, ptr %p
@@ -110,8 +110,8 @@ define ptr @fmla_v4f16(ptr %p, ptr %ps, <4 x half> %t, <4 x half> %u) {
 define ptr @fmul_v4f32(ptr %p, ptr %ps, <4 x float> %t) {
 ; CHECK-LABEL: fmul_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1r { v1.4s }, [x0], #4
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ldr s1, [x0], #4
+; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
   %l = load float, ptr %p
@@ -126,8 +126,8 @@ define ptr @fmul_v4f32(ptr %p, ptr %ps, <4 x float> %t) {
 define ptr @fmla_v4f32(ptr %p, ptr %ps, <4 x float> %t, <4 x float> %u) {
 ; CHECK-LABEL: fmla_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1r { v2.4s }, [x0], #4
-; CHECK-NEXT:    fmla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ldr s2, [x0], #4
+; CHECK-NEXT:    fmla v1.4s, v0.4s, v2.s[0]
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    ret
   %l = load float, ptr %p
@@ -143,8 +143,8 @@ define ptr @fmla_v4f32(ptr %p, ptr %ps, <4 x float> %t, <4 x float> %u) {
 define ptr @fmul_v2f64(ptr %p, ptr %ps, <2 x double> %t) {
 ; CHECK-LABEL: fmul_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1r { v1.2d }, [x0], #8
-; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ldr d1, [x0], #8
+; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
   %l = load double, ptr %p
@@ -159,8 +159,8 @@ define ptr @fmul_v2f64(ptr %p, ptr %ps, <2 x double> %t) {
 define ptr @fmla_v2f64(ptr %p, ptr %ps, <2 x double> %t, <2 x double> %u) {
 ; CHECK-LABEL: fmla_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1r { v2.2d }, [x0], #8
-; CHECK-NEXT:    fmla v1.2d, v0.2d, v2.2d
+; CHECK-NEXT:    ldr d2, [x0], #8
+; CHECK-NEXT:    fmla v1.2d, v0.2d, v2.d[0]
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    ret
   %l = load double, ptr %p


        

