[llvm] [LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. (PR #167340)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 10:37:06 PST 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/167340
>From d0e82f298e0ab203304f2d61308ae71009cb1b0f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:42:56 +0000
Subject: [PATCH 1/4] Update sve-bf16-combines.ll to show output without sve-b16b16 support.
---
.../test/CodeGen/AArch64/sve-bf16-combines.ll | 791 +++++++++++++++---
1 file changed, 664 insertions(+), 127 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 5c58eab391972..1e3657ad703d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -1,79 +1,217 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16
target triple = "aarch64-unknown-linux-gnu"
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.h
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%res = fadd contract <vscale x 8 x bfloat> %acc, %mul
ret <vscale x 8 x bfloat> %res
}
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.s
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%res = fadd contract <vscale x 4 x bfloat> %acc, %mul
ret <vscale x 4 x bfloat> %res
}
define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.d
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%res = fadd contract <vscale x 2 x bfloat> %acc, %mul
ret <vscale x 2 x bfloat> %res
}
define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.h
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%res = fsub contract <vscale x 8 x bfloat> %acc, %mul
ret <vscale x 8 x bfloat> %res
}
define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.s
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%res = fsub contract <vscale x 4 x bfloat> %acc, %mul
ret <vscale x 4 x bfloat> %res
}
define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.d
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%res = fsub contract <vscale x 2 x bfloat> %acc, %mul
ret <vscale x 2 x bfloat> %res
}
define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%add = fadd contract <vscale x 8 x bfloat> %acc, %mul
%res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %add, <vscale x 8 x bfloat> %acc
@@ -81,10 +219,23 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
}
define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: lsl z2.s, z0.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z1.s, z2.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%add = fadd contract <vscale x 4 x bfloat> %acc, %mul
%res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %add, <vscale x 4 x bfloat> %acc
@@ -92,10 +243,23 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
}
define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p1.d
+; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: lsl z2.s, z0.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%add = fadd contract <vscale x 2 x bfloat> %acc, %mul
%res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %add, <vscale x 2 x bfloat> %acc
@@ -103,10 +267,39 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
}
define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 8 x bfloat> %acc, %mul
%res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %sub, <vscale x 8 x bfloat> %acc
@@ -114,10 +307,23 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
}
define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: lsl z2.s, z0.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z1.s, z2.s, z1.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 4 x bfloat> %acc, %mul
%res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %sub, <vscale x 4 x bfloat> %acc
@@ -125,10 +331,23 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
}
define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: ptrue p1.d
+; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: lsl z2.s, z0.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsubr z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 2 x bfloat> %acc, %mul
%res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %sub, <vscale x 2 x bfloat> %acc
@@ -136,33 +355,90 @@ define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
}
define <vscale x 8 x bfloat> @fadd_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
%fadd = fadd nsz <vscale x 8 x bfloat> %a, %sel
ret <vscale x 8 x bfloat> %fadd
}
define <vscale x 8 x bfloat> @fsub_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
%fsub = fsub <vscale x 8 x bfloat> %a, %sel
ret <vscale x 8 x bfloat> %fsub
}
define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_negzero_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_negzero_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz
%fadd = fadd <vscale x 8 x bfloat> %a, %sel
@@ -170,11 +446,30 @@ define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a
}
define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_negzero_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_negzero_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz
%fsub = fsub nsz <vscale x 8 x bfloat> %a, %sel
@@ -182,13 +477,46 @@ define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a
}
define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h
-; CHECK-NEXT: bfadd z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_fmul_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: movi v3.2d, #0000000000000000
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: sel z1.h, p0, z1.h, z3.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z0.h, p1/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: movi v3.2d, #0000000000000000
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h
+; SVE-B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
%fadd = fadd contract <vscale x 8 x bfloat> %a, %sel
@@ -196,12 +524,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <
}
define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_fmul_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
%fsub = fsub contract <vscale x 8 x bfloat> %a, %sel
@@ -209,12 +566,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <
}
define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
%fadd = fadd nsz contract <vscale x 8 x bfloat> %a, %sel
@@ -222,12 +608,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %
}
define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
%fsub = fsub nsz contract <vscale x 8 x bfloat> %a, %sel
@@ -235,12 +650,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %
}
define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -249,15 +693,50 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
}
define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32768 // =0x8000
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: fmov h3, w8
-; CHECK-NEXT: mov z3.h, h3
-; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h
-; CHECK-NEXT: bfsub z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: mov w8, #32768 // =0x8000
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: fmov h3, w8
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: mov z3.h, h3
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: sel z1.h, p0, z1.h, z3.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z0.s, z0.s, z1.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z0.h, p1/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: mov w8, #32768 // =0x8000
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: fmov h3, w8
+; SVE-B16B16-NEXT: mov z3.h, h3
+; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h
+; SVE-B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -266,12 +745,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
}
define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -280,12 +788,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x b
}
define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z2.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmul z3.s, z4.s, z3.s
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%fmul = fmul <vscale x 8 x bfloat> %b, %c
%nz = fneg <vscale x 8 x bfloat> zeroinitializer
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
>From e0fd9b91c59682e6df886c71ae50968acafc3b21 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:54:13 +0000
Subject: [PATCH 2/4] [LLVM][CodeGen][SVE] Enable BFloat fma contraction more
aggressively.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
.../test/CodeGen/AArch64/sve-bf16-combines.ll | 148 +++++++-----------
2 files changed, 55 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..da6c65f2c1c7d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
case MVT::f64:
return true;
case MVT::bf16:
- return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
+ return VT.isScalableVector() && Subtarget->hasBF16() &&
Subtarget->isNonStreamingSVEorSME2Available();
default:
break;
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 1e3657ad703d9..230bd9cf5420f 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -9,26 +9,20 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
; SVE: // %bb.0:
; SVE-NEXT: uunpkhi z3.s, z2.h
; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpkhi z5.s, z0.h
; SVE-NEXT: uunpklo z2.s, z2.h
; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z0.s, z0.h
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z5.s, z5.s, #16
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmul z3.s, z4.s, z3.s
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
-; SVE-NEXT: uunpkhi z3.s, z0.h
-; SVE-NEXT: uunpklo z0.s, z0.h
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z2.s, z3.s, z2.s
-; SVE-NEXT: fadd z0.s, z0.s, z1.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
; SVE-NEXT: ret
@@ -48,12 +42,9 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
; SVE: // %bb.0:
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: ptrue p0.s
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z0.s, z0.s, z1.s
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -72,12 +63,9 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
; SVE: // %bb.0:
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: ptrue p0.d
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -94,28 +82,24 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
; SVE-LABEL: fmls_nxv8bf16:
; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.h
; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpkhi z4.s, z0.h
; SVE-NEXT: uunpklo z2.s, z2.h
-; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmul z3.s, z4.s, z3.s
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
-; SVE-NEXT: uunpkhi z3.s, z0.h
-; SVE-NEXT: uunpklo z0.s, z0.h
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: uunpkhi z5.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: lsl z5.s, z5.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsub z2.s, z3.s, z2.s
-; SVE-NEXT: fsub z0.s, z0.s, z1.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
; SVE-NEXT: ret
@@ -133,14 +117,12 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
; SVE-LABEL: fmls_nxv4bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsub z0.s, z0.s, z1.s
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -157,14 +139,12 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
; SVE-LABEL: fmls_nxv2bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
-; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -183,26 +163,20 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
; SVE: // %bb.0:
; SVE-NEXT: uunpkhi z3.s, z2.h
; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpkhi z5.s, z0.h
; SVE-NEXT: uunpklo z2.s, z2.h
; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z6.s, z0.h
; SVE-NEXT: ptrue p1.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: lsl z5.s, z5.s, #16
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmul z3.s, z4.s, z3.s
-; SVE-NEXT: uunpklo z4.s, z0.h
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: lsl z6.s, z6.s, #16
+; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT: uunpkhi z3.s, z0.h
-; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z3.s, z3.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z2.s, z3.s, z2.s
-; SVE-NEXT: fadd z1.s, z4.s, z1.s
-; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
; SVE-NEXT: mov z0.h, p0/m, z1.h
@@ -223,12 +197,9 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
; SVE: // %bb.0:
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
; SVE-NEXT: ptrue p1.s
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: lsl z2.s, z0.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z1.s, z2.s, z1.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
; SVE-NEXT: ret
;
@@ -247,12 +218,9 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
; SVE: // %bb.0:
; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
; SVE-NEXT: ptrue p1.d
-; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
-; SVE-NEXT: lsl z2.s, z0.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fadd z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
; SVE-NEXT: ret
;
@@ -269,28 +237,24 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
; SVE-LABEL: fmls_sel_nxv8bf16:
; SVE: // %bb.0:
+; SVE-NEXT: ptrue p1.h
; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpkhi z4.s, z0.h
; SVE-NEXT: uunpklo z2.s, z2.h
-; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z6.s, z0.h
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
; SVE-NEXT: ptrue p1.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z6.s, z6.s, #16
+; SVE-NEXT: uunpkhi z5.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: lsl z5.s, z5.s, #16
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmul z3.s, z4.s, z3.s
-; SVE-NEXT: uunpklo z4.s, z0.h
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
+; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT: uunpkhi z3.s, z0.h
-; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z3.s, z3.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsub z2.s, z3.s, z2.s
-; SVE-NEXT: fsub z1.s, z4.s, z1.s
-; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
; SVE-NEXT: mov z0.h, p0/m, z1.h
@@ -309,14 +273,12 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
; SVE-LABEL: fmls_sel_nxv4bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: ptrue p1.s
-; SVE-NEXT: fmul z1.s, z1.s, z2.s
-; SVE-NEXT: lsl z2.s, z0.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsub z1.s, z2.s, z1.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
; SVE-NEXT: ret
;
@@ -333,14 +295,12 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
; SVE-LABEL: fmls_sel_nxv2bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: ptrue p1.d
-; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
-; SVE-NEXT: lsl z2.s, z0.s, #16
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fsubr z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
; SVE-NEXT: ret
;
>From 9886945da72e51cc8a2b6203088e8cd76e8bea94 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:29:29 +0000
Subject: [PATCH 3/4] [LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma
operations.
NOTE: From what I can see, LLVM has no support for FEAT_AFP in terms of
feature detection or ACLE builtins, so I believe the compiler can (and
does) work under the assumption that the feature is not enabled.
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 +
llvm/test/CodeGen/AArch64/sve-bf16-arith.ll | 28 +++---
.../test/CodeGen/AArch64/sve-bf16-combines.ll | 95 +++++++------------
3 files changed, 50 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..ce6de5c780cf3 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2578,6 +2578,10 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
+ def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
+ (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
+ (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
+
defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>;
} // End HasBF16, HasSVE_or_SME
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
index 0580f5e0b019a..582e8456c05b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
@@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
; NOB16B16-LABEL: fmla_nxv4bf16:
; NOB16B16: // %bb.0:
-; NOB16B16-NEXT: lsl z1.s, z1.s, #16
-; NOB16B16-NEXT: lsl z0.s, z0.s, #16
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
; NOB16B16-NEXT: ptrue p0.s
-; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s
; NOB16B16-NEXT: ret
;
; B16B16-LABEL: fmla_nxv4bf16:
@@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
; NOB16B16-LABEL: fmla_nxv8bf16:
; NOB16B16: // %bb.0:
-; NOB16B16-NEXT: uunpkhi z3.s, z1.h
-; NOB16B16-NEXT: uunpkhi z4.s, z0.h
-; NOB16B16-NEXT: uunpkhi z5.s, z2.h
+; NOB16B16-NEXT: uunpkhi z3.s, z2.h
+; NOB16B16-NEXT: uunpklo z2.s, z2.h
+; NOB16B16-NEXT: uunpkhi z4.s, z1.h
+; NOB16B16-NEXT: uunpkhi z5.s, z0.h
; NOB16B16-NEXT: uunpklo z1.s, z1.h
; NOB16B16-NEXT: uunpklo z0.s, z0.h
-; NOB16B16-NEXT: uunpklo z2.s, z2.h
; NOB16B16-NEXT: ptrue p0.s
; NOB16B16-NEXT: lsl z3.s, z3.s, #16
-; NOB16B16-NEXT: lsl z4.s, z4.s, #16
-; NOB16B16-NEXT: lsl z5.s, z5.s, #16
-; NOB16B16-NEXT: lsl z1.s, z1.s, #16
-; NOB16B16-NEXT: lsl z0.s, z0.s, #16
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
-; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
-; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
-; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
-; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h
+; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h
; NOB16B16-NEXT: ret
;
; B16B16-LABEL: fmla_nxv8bf16:
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 230bd9cf5420f..16e8feb0dc5bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -7,21 +7,17 @@ target triple = "aarch64-unknown-linux-gnu"
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
; SVE-LABEL: fmla_nxv8bf16:
; SVE: // %bb.0:
-; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z1.h
-; SVE-NEXT: uunpkhi z5.s, z0.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z4.s, z2.h
+; SVE-NEXT: uunpkhi z5.s, z1.h
; SVE-NEXT: uunpklo z2.s, z2.h
; SVE-NEXT: uunpklo z1.s, z1.h
-; SVE-NEXT: uunpklo z0.s, z0.h
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: lsl z3.s, z3.s, #16
-; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: lsl z5.s, z5.s, #16
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s
-; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -40,11 +36,9 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
; SVE-LABEL: fmla_nxv4bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
; SVE-NEXT: ptrue p0.s
-; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -83,22 +77,18 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
; SVE-LABEL: fmls_nxv8bf16:
; SVE: // %bb.0:
; SVE-NEXT: ptrue p0.h
-; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z0.h
-; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z5.s, z2.h
+; SVE-NEXT: uunpklo z2.s, z2.h
; SVE-NEXT: fneg z1.h, p0/m, z1.h
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: lsl z3.s, z3.s, #16
-; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: uunpkhi z5.s, z1.h
+; SVE-NEXT: uunpkhi z4.s, z1.h
; SVE-NEXT: uunpklo z1.s, z1.h
-; SVE-NEXT: lsl z5.s, z5.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s
-; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -118,11 +108,9 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
; SVE-LABEL: fmls_nxv4bf16:
; SVE: // %bb.0:
; SVE-NEXT: ptrue p0.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z0.s, z0.s, #16
; SVE-NEXT: fneg z1.h, p0/m, z1.h
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
; SVE-NEXT: ret
;
@@ -161,24 +149,20 @@ define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
; SVE-LABEL: fmla_sel_nxv8bf16:
; SVE: // %bb.0:
-; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z1.h
-; SVE-NEXT: uunpkhi z5.s, z0.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: uunpkhi z5.s, z2.h
+; SVE-NEXT: uunpkhi z6.s, z1.h
; SVE-NEXT: uunpklo z2.s, z2.h
; SVE-NEXT: uunpklo z1.s, z1.h
-; SVE-NEXT: uunpklo z6.s, z0.h
; SVE-NEXT: ptrue p1.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: lsl z5.s, z5.s, #16
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: lsl z6.s, z6.s, #16
-; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s
-; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
-; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h
+; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
; SVE-NEXT: mov z0.h, p0/m, z1.h
; SVE-NEXT: ret
;
@@ -195,12 +179,9 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
; SVE-LABEL: fmla_sel_nxv4bf16:
; SVE: // %bb.0:
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
; SVE-NEXT: lsl z3.s, z0.s, #16
-; SVE-NEXT: ptrue p1.s
-; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
-; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
; SVE-NEXT: ret
;
; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
@@ -238,25 +219,21 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
; SVE-LABEL: fmls_sel_nxv8bf16:
; SVE: // %bb.0:
; SVE-NEXT: ptrue p1.h
-; SVE-NEXT: uunpkhi z3.s, z2.h
-; SVE-NEXT: uunpkhi z4.s, z0.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: uunpkhi z6.s, z2.h
; SVE-NEXT: uunpklo z2.s, z2.h
-; SVE-NEXT: uunpklo z6.s, z0.h
; SVE-NEXT: fneg z1.h, p1/m, z1.h
; SVE-NEXT: ptrue p1.s
; SVE-NEXT: lsl z3.s, z3.s, #16
; SVE-NEXT: lsl z4.s, z4.s, #16
-; SVE-NEXT: lsl z2.s, z2.s, #16
-; SVE-NEXT: lsl z6.s, z6.s, #16
; SVE-NEXT: uunpkhi z5.s, z1.h
; SVE-NEXT: uunpklo z1.s, z1.h
-; SVE-NEXT: lsl z5.s, z5.s, #16
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s
-; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
-; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h
+; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
; SVE-NEXT: mov z0.h, p0/m, z1.h
; SVE-NEXT: ret
;
@@ -274,12 +251,10 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
; SVE-LABEL: fmls_sel_nxv4bf16:
; SVE: // %bb.0:
; SVE-NEXT: ptrue p1.s
-; SVE-NEXT: lsl z2.s, z2.s, #16
; SVE-NEXT: lsl z3.s, z0.s, #16
; SVE-NEXT: fneg z1.h, p1/m, z1.h
-; SVE-NEXT: lsl z1.s, z1.s, #16
-; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
-; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
; SVE-NEXT: ret
;
; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:
>From 8112f30b5dde935d4a3a58666023609f49a464b0 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 18:36:33 +0000
Subject: [PATCH 4/4] Reformat AArch64SVEInstrInfo.td changes.
---
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index ce6de5c780cf3..ece012a035bfc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2578,8 +2578,9 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
- def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
- (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
+ def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc,
+ (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
+ (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
(BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
More information about the llvm-commits
mailing list