[llvm] [CodeGen][AArch64] Sink splat operands of FMul instructions (PR #116222)

Thu Nov 14 05:06:01 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Hari Limaye (hazzlim)

<details>
<summary>Changes</summary>

Sink shuffle operands of FMul instructions when they are splats, because we can then generate the lane-indexed variants of the instruction.


---
Full diff: https://github.com/llvm/llvm-project/pull/116222.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+8) 
- (modified) llvm/test/CodeGen/AArch64/sinksplat.ll (+15-11) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92a..eb3b9609f697ca 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5239,6 +5239,14 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     // Is it profitable to sink if we found two of the same type of extends.
     return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
+  case Instruction::FMul: {
+    // Sink splats for index lane variants
+    if (isSplatShuffle(I->getOperand(0)))
+      Ops.push_back(&I->getOperandUse(0));
+    if (isSplatShuffle(I->getOperand(1)))
+      Ops.push_back(&I->getOperandUse(1));
+    return !Ops.empty();
+  }
   default:
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index d156ec079ae941..c94d5bf2a208ff 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -230,29 +230,34 @@ l2:
   ret <4 x i32> %c
 }
 
-define <4 x float> @fmul(<4 x float> %x, ptr %y) {
+define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v1.s[3]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    subs w8, w8, #1
+; CHECK-NEXT:    ldr q2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp w8, #16
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.s[0]
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
 entry:
-  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <4 x float> %x.ins, <4 x float> undef, <4 x i32> zeroinitializer
   br label %l1
 
 l1:
   %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
   %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
-  %l = load <4 x float>, ptr %y
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <4 x float>, ptr %ptr.y
   %b = fmul <4 x float> %l, %a
   %c = fadd <4 x float> %b, %q
   %pa = add i32 %p, 1
@@ -270,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB8_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fmla v0.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    subs w8, w8, #1
 ; CHECK-NEXT:    b.eq .LBB8_1
 ; CHECK-NEXT:  // %bb.2: // %l2

``````````

</details>


https://github.com/llvm/llvm-project/pull/116222