[llvm] [CodeGen][AArch64] Sink splat operands of FMul instructions (PR #116222)

Thu Nov 14 05:05:30 PST 2024

https://github.com/hazzlim created https://github.com/llvm/llvm-project/pull/116222

Sink shuffle operands of FMul instructions if these are splats, as we can generate lane-indexed variants for these.


>From 8f7c6c64249aeb65e2d82dcb1030d6680b529183 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 12:21:32 +0000
Subject: [PATCH 1/2] [CodeGen][AArch64] Update fmul sinksplat test (NFC)

In the `fmul` test in CodeGen/AArch64/sinksplat.ll both operands to the
fmul are loop-invariant and so the operation is hoisted out of the loop,
meaning we fail to test what we are trying to test (sinking of
splats). Fix this by making one of the operands not loop-invariant.
---
 llvm/test/CodeGen/AArch64/sinksplat.ll | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index d156ec079ae941..1e587ef487eadb 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -230,29 +230,34 @@ l2:
   ret <4 x i32> %c
 }
 
-define <4 x float> @fmul(<4 x float> %x, ptr %y) {
+define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v1.s[3]
+; CHECK-NEXT:    ld1r { v1.4s }, [x0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    subs w8, w8, #1
+; CHECK-NEXT:    ldr q2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp w8, #16
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
 entry:
-  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <4 x float> %x.ins, <4 x float> undef, <4 x i32> zeroinitializer
   br label %l1
 
 l1:
   %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
   %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
-  %l = load <4 x float>, ptr %y
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <4 x float>, ptr %ptr.y
   %b = fmul <4 x float> %l, %a
   %c = fadd <4 x float> %b, %q
   %pa = add i32 %p, 1

>From 3686f68bf797597460bc538c35757496c003cc74 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 12:41:30 +0000
Subject: [PATCH 2/2] [CodeGen][AArch64] Sink splat operands of FMul
 instructions

Sink shuffle operands of FMul instructions if these are splats, as we
can generate lane-indexed variants for these.
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 8 ++++++++
 llvm/test/CodeGen/AArch64/sinksplat.ll                 | 7 +++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92a..eb3b9609f697ca 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5239,6 +5239,14 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     // Is it profitable to sink if we found two of the same type of extends.
     return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
+  case Instruction::FMul: {
+    // Sink splats for index lane variants
+    if (isSplatShuffle(I->getOperand(0)))
+      Ops.push_back(&I->getOperandUse(0));
+    if (isSplatShuffle(I->getOperand(1)))
+      Ops.push_back(&I->getOperandUse(1));
+    return !Ops.empty();
+  }
   default:
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index 1e587ef487eadb..c94d5bf2a208ff 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -234,14 +234,14 @@ define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ld1r { v1.4s }, [x0]
+; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp w8, #16
-; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.s[0]
 ; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -275,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB8_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fmla v0.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    subs w8, w8, #1
 ; CHECK-NEXT:    b.eq .LBB8_1
 ; CHECK-NEXT:  // %bb.2: // %l2