[llvm] [CodeGen][AArch64] Sink splat operands of FMul instructions (PR #116222)

Hari Limaye via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 15 01:32:51 PST 2024


https://github.com/hazzlim updated https://github.com/llvm/llvm-project/pull/116222

>From 8f7c6c64249aeb65e2d82dcb1030d6680b529183 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 12:21:32 +0000
Subject: [PATCH 1/5] [CodeGen][AArch64] Update fmul sinksplat test (NFC)

In the `fmul` test in CodeGen/AArch64/sinksplat.ll both operands to the
fmul are loop-invariant and so the operation is hoisted out of the loop,
meaning we fail to test what we are trying to test (sinking of
splats). Fix this by making one of the operands not loop-invariant.
---
 llvm/test/CodeGen/AArch64/sinksplat.ll | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index d156ec079ae941..1e587ef487eadb 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -230,29 +230,34 @@ l2:
   ret <4 x i32> %c
 }
 
-define <4 x float> @fmul(<4 x float> %x, ptr %y) {
+define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v1.s[3]
+; CHECK-NEXT:    ld1r { v1.4s }, [x0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    subs w8, w8, #1
+; CHECK-NEXT:    ldr q2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp w8, #16
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
 entry:
-  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <4 x float> %x.ins, <4 x float> undef, <4 x i32> zeroinitializer
   br label %l1
 
 l1:
   %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
   %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
-  %l = load <4 x float>, ptr %y
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <4 x float>, ptr %ptr.y
   %b = fmul <4 x float> %l, %a
   %c = fadd <4 x float> %b, %q
   %pa = add i32 %p, 1

>From 3686f68bf797597460bc538c35757496c003cc74 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 12:41:30 +0000
Subject: [PATCH 2/5] [CodeGen][AArch64] Sink splat operands of FMul
 instructions

Sink shuffle operands of FMul instructions if these are splats, as we
can generate lane-indexed variants for these.
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 8 ++++++++
 llvm/test/CodeGen/AArch64/sinksplat.ll                 | 7 +++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92a..eb3b9609f697ca 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5239,6 +5239,14 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     // Is it profitable to sink if we found two of the same type of extends.
     return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
+  case Instruction::FMul: {
+    // Sink splats for index lane variants
+    if (isSplatShuffle(I->getOperand(0)))
+      Ops.push_back(&I->getOperandUse(0));
+    if (isSplatShuffle(I->getOperand(1)))
+      Ops.push_back(&I->getOperandUse(1));
+    return !Ops.empty();
+  }
   default:
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index 1e587ef487eadb..c94d5bf2a208ff 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -234,14 +234,14 @@ define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ld1r { v1.4s }, [x0]
+; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp w8, #16
-; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.s[0]
 ; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -275,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB8_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fmla v0.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    subs w8, w8, #1
 ; CHECK-NEXT:    b.eq .LBB8_1
 ; CHECK-NEXT:  // %bb.2: // %l2

>From 6b9c2e2c2a32694be983107c02c892a4db0d3373 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 15:08:21 +0000
Subject: [PATCH 3/5] Add scalable test

---
 llvm/test/CodeGen/AArch64/sinksplat.ll | 48 +++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index c94d5bf2a208ff..2ab7dd96f51f27 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s
 
 define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: smull:
@@ -422,6 +422,52 @@ l2:
   ret <4 x i32> %r
 }
 
+; We shouldn't sink the splat operand for scalable vectors.
+define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
+; CHECK-LABEL: fmul_scalable:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    sxtw x8, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    mov z1.s, s1
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:  .LBB13_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    subs w9, w9, #1
+; CHECK-NEXT:    add x1, x1, x8
+; CHECK-NEXT:    fmul z2.s, z2.s, z1.s
+; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
+; CHECK-NEXT:    b.eq .LBB13_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <vscale x 4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <vscale x 4 x float> %x.ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  %33 = tail call i32 @llvm.vscale.i32()
+  %34 = shl nuw nsw i32 %33, 4
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, %34
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <vscale x 4 x float>, ptr %ptr.y
+  %b = fmul <vscale x 4 x float> %l, %a
+  %c = fadd <vscale x 4 x float> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <vscale x 4 x float> %c
+}
+
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

>From 4ab2719c3f1c2d31806c3e292b010bc57f94b081 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 16:10:10 +0000
Subject: [PATCH 4/5] Don't sink splat operands when VT is scalable

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 4 ++++
 llvm/test/CodeGen/AArch64/sinksplat.ll                 | 5 ++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index eb3b9609f697ca..c9652340e5b5d5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5240,6 +5240,10 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
   case Instruction::FMul: {
+    // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
+    if (I->getType()->isScalableTy())
+      return false;
+
     // Sink splats for index lane variants
     if (isSplatShuffle(I->getOperand(0)))
       Ops.push_back(&I->getOperandUse(0));
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index 2ab7dd96f51f27..6ccfaa3bc172f5 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -426,13 +426,12 @@ l2:
 define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_scalable:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    sxtw x8, w8
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    mov z1.s, s1
+; CHECK-NEXT:    ld1rw { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    lsl x8, x8, #2
 ; CHECK-NEXT:  .LBB13_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1

>From 698c06bfb87d67680380a4139fe0cf722f3bac75 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 14 Nov 2024 23:25:54 +0000
Subject: [PATCH 5/5] Improve handling of half element types

- Add tests for half element types, and only sink operands when
  subtarget has fullfp16
- Refactor scalable test to use target-features attribute, rather than
  -mattr on the RUN line
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  4 +
 llvm/test/CodeGen/AArch64/sinksplat.ll        | 91 ++++++++++++++++++-
 2 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c9652340e5b5d5..84212b03686b19 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5244,6 +5244,10 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     if (I->getType()->isScalableTy())
       return false;
 
+    if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
+        !ST->hasFullFP16())
+      return false;
+
     // Sink splats for index lane variants
     if (isSplatShuffle(I->getOperand(0)))
       Ops.push_back(&I->getOperandUse(0));
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index 6ccfaa3bc172f5..cceeb9f3e830ac 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
 define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: smull:
@@ -422,8 +422,91 @@ l2:
   ret <4 x i32> %r
 }
 
+; We shouldn't sink without fullfp16.
+define <4 x half> @fmul_half(ptr %x, ptr %y) {
+; CHECK-LABEL: fmul_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ld1r { v1.4h }, [x0]
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:  .LBB13_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d2, [x1, x8]
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    cmp w8, #8
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    b.eq .LBB13_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load half, ptr %x
+  %x.ins = insertelement <4 x half> poison, half %x.val, i64 0
+  %a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr half, ptr %y, i32 %idx.y
+  %l = load <4 x half>, ptr %ptr.y
+  %b = fmul <4 x half> %l, %a
+  %c = fadd <4 x half> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <4 x half> %c
+}
+
+define <4 x half> @fmul_half_fullfp16(ptr %x, ptr %y) "target-features"="+fullfp16" {
+; CHECK-LABEL: fmul_half_fullfp16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ldr h1, [x0]
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB14_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    cmp w8, #8
+; CHECK-NEXT:    fmul v2.4h, v2.4h, v1.h[0]
+; CHECK-NEXT:    fadd v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    b.eq .LBB14_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load half, ptr %x
+  %x.ins = insertelement <4 x half> poison, half %x.val, i64 0
+  %a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr half, ptr %y, i32 %idx.y
+  %l = load <4 x half>, ptr %ptr.y
+  %b = fmul <4 x half> %l, %a
+  %c = fadd <4 x half> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <4 x half> %c
+}
+
 ; We shouldn't sink the splat operand for scalable vectors.
-define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
+define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) "target-features"="+sve" {
 ; CHECK-LABEL: fmul_scalable:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
@@ -433,14 +516,14 @@ define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
 ; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    ld1rw { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    lsl x8, x8, #2
-; CHECK-NEXT:  .LBB13_1: // %l1
+; CHECK-NEXT:  .LBB15_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
 ; CHECK-NEXT:    subs w9, w9, #1
 ; CHECK-NEXT:    add x1, x1, x8
 ; CHECK-NEXT:    fmul z2.s, z2.s, z1.s
 ; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
-; CHECK-NEXT:    b.eq .LBB13_1
+; CHECK-NEXT:    b.eq .LBB15_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
 entry:



More information about the llvm-commits mailing list