[llvm] [LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. (PR #167340)

Mon Nov 10 10:37:06 PST 2025

https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/167340

>From d0e82f298e0ab203304f2d61308ae71009cb1b0f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:42:56 +0000
Subject: [PATCH 1/4] Update sve-bf16-combines.ll to show output without
 sve-b16b16 support.

---
 .../test/CodeGen/AArch64/sve-bf16-combines.ll | 791 +++++++++++++++---
 1 file changed, 664 insertions(+), 127 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 5c58eab391972..1e3657ad703d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -1,79 +1,217 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+bf16             < %s | FileCheck %s --check-prefixes=SVE
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p0/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.h
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
   %res = fadd contract <vscale x 8 x bfloat> %acc, %mul
   ret <vscale x 8 x bfloat> %res
 }
 
 define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_nxv4bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.s
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
   %res = fadd contract <vscale x 4 x bfloat> %acc, %mul
   ret <vscale x 4 x bfloat> %res
 }
 
 define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv2bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_nxv2bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_nxv2bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.d
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
   %res = fadd contract <vscale x 2 x bfloat> %acc, %mul
   ret <vscale x 2 x bfloat> %res
 }
 
 define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p0/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.h
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
   %res = fsub contract <vscale x 8 x bfloat> %acc, %mul
   ret <vscale x 8 x bfloat> %res
 }
 
 define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_nxv4bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.s
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
   %res = fsub contract <vscale x 4 x bfloat> %acc, %mul
   ret <vscale x 4 x bfloat> %res
 }
 
 define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv2bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_nxv2bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_nxv2bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    ptrue p0.d
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
   %res = fsub contract <vscale x 2 x bfloat> %acc, %mul
   ret <vscale x 2 x bfloat> %res
 }
 
 define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_sel_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
   %add = fadd contract <vscale x 8 x bfloat> %acc, %mul
   %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %add, <vscale x 8 x bfloat> %acc
@@ -81,10 +219,23 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 }
 
 define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_sel_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    lsl z2.s, z0.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z1.s, z2.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
   %add = fadd contract <vscale x 4 x bfloat> %acc, %mul
   %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %add, <vscale x 4 x bfloat> %acc
@@ -92,10 +243,23 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
 }
 
 define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv2bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmla_sel_nxv2bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p1.d
+; SVE-NEXT:    fmul z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    lsl z2.s, z0.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv2bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
   %add = fadd contract <vscale x 2 x bfloat> %acc, %mul
   %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %add, <vscale x 2 x bfloat> %acc
@@ -103,10 +267,39 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
 }
 
 define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_sel_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
   %sub = fsub contract <vscale x 8 x bfloat> %acc, %mul
   %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %sub, <vscale x 8 x bfloat> %acc
@@ -114,10 +307,23 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 }
 
 define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_sel_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    lsl z2.s, z0.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z1.s, z2.s, z1.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
   %sub = fsub contract <vscale x 4 x bfloat> %acc, %mul
   %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %sub, <vscale x 4 x bfloat> %acc
@@ -125,10 +331,23 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
 }
 
 define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv2bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fmls_sel_nxv2bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    ptrue p1.d
+; SVE-NEXT:    fmul z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    lsl z2.s, z0.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsubr z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv2bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT:    ret
   %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
   %sub = fsub contract <vscale x 2 x bfloat> %acc, %mul
   %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %sub, <vscale x 2 x bfloat> %acc
@@ -136,33 +355,90 @@ define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
   %fadd = fadd nsz <vscale x 8 x bfloat> %a, %sel
   ret <vscale x 8 x bfloat> %fadd
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
   %fsub = fsub <vscale x 8 x bfloat> %a, %sel
   ret <vscale x 8 x bfloat> %fsub
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_negzero_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_negzero_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz
   %fadd = fadd <vscale x 8 x bfloat> %a, %sel
@@ -170,11 +446,30 @@ define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_negzero_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_negzero_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz
   %fsub = fsub nsz <vscale x 8 x bfloat> %a, %sel
@@ -182,13 +477,46 @@ define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
-; CHECK-NEXT:    bfadd z0.h, z0.h, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_fmul_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    movi v3.2d, #0000000000000000
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    sel z1.h, p0, z1.h, z3.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z0.h, p1/m, z0.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    movi v3.2d, #0000000000000000
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    sel z1.h, p0, z1.h, z3.h
+; SVE-B16B16-NEXT:    bfadd z0.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
   %fadd = fadd contract <vscale x 8 x bfloat> %a, %sel
@@ -196,12 +524,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_fmul_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
   %fsub = fsub contract <vscale x 8 x bfloat> %a, %sel
@@ -209,12 +566,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
   %fadd = fadd nsz contract <vscale x 8 x bfloat> %a, %sel
@@ -222,12 +608,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer
   %fsub = fsub nsz contract <vscale x 8 x bfloat> %a, %sel
@@ -235,12 +650,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -249,15 +693,50 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    fmov h3, w8
-; CHECK-NEXT:    mov z3.h, h3
-; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
-; CHECK-NEXT:    bfsub z0.h, z0.h, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    mov w8, #32768 // =0x8000
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    fmov h3, w8
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    mov z3.h, h3
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    sel z1.h, p0, z1.h, z3.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpkhi z2.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z0.s, z0.s, z1.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z0.h, p1/m, z0.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    mov w8, #32768 // =0x8000
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    fmov h3, w8
+; SVE-B16B16-NEXT:    mov z3.h, h3
+; SVE-B16B16-NEXT:    sel z1.h, p0, z1.h, z3.h
+; SVE-B16B16-NEXT:    bfsub z0.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -266,12 +745,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
 }
 
 define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fadd z2.s, z3.s, z2.s
+; SVE-NEXT:    fadd z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz
@@ -280,12 +788,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x b
 }
 
 define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z3.s, z2.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fmul z3.s, z4.s, z3.s
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z3.s, #16
+; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    fsub z2.s, z3.s, z2.s
+; SVE-NEXT:    fsub z1.s, z4.s, z1.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-NEXT:    ret
+;
+; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16:
+; SVE-B16B16:       // %bb.0:
+; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
+; SVE-B16B16-NEXT:    bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT:    mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT:    ret
   %fmul = fmul <vscale x 8 x bfloat> %b, %c
   %nz = fneg <vscale x 8 x bfloat> zeroinitializer
   %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz

>From e0fd9b91c59682e6df886c71ae50968acafc3b21 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:54:13 +0000
Subject: [PATCH 2/4] [LLVM][CodeGen][SVE] Enable BFloat fma contraction more
 aggressively.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 .../test/CodeGen/AArch64/sve-bf16-combines.ll | 148 +++++++-----------
 2 files changed, 55 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..da6c65f2c1c7d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
   case MVT::f64:
     return true;
   case MVT::bf16:
-    return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
+    return VT.isScalableVector() && Subtarget->hasBF16() &&
            Subtarget->isNonStreamingSVEorSME2Available();
   default:
     break;
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 1e3657ad703d9..230bd9cf5420f 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -9,26 +9,20 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    uunpkhi z3.s, z2.h
 ; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpkhi z5.s, z0.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
 ; SVE-NEXT:    ptrue p0.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z5.s, z5.s, #16
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmul z3.s, z4.s, z3.s
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    bfcvt z2.h, p0/m, z3.s
-; SVE-NEXT:    uunpkhi z3.s, z0.h
-; SVE-NEXT:    uunpklo z0.s, z0.h
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z2.s, z3.s, z2.s
-; SVE-NEXT:    fadd z0.s, z0.s, z1.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT:    fmad z3.s, p0/m, z4.s, z5.s
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; SVE-NEXT:    ret
@@ -48,12 +42,9 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    ptrue p0.s
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z0.s, z0.s, z1.s
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -72,12 +63,9 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    ptrue p0.d
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -94,28 +82,24 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
 define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
 ; SVE-LABEL: fmls_nxv8bf16:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.h
 ; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpkhi z4.s, z0.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
-; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    fneg z1.h, p0/m, z1.h
 ; SVE-NEXT:    ptrue p0.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmul z3.s, z4.s, z3.s
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    bfcvt z2.h, p0/m, z3.s
-; SVE-NEXT:    uunpkhi z3.s, z0.h
-; SVE-NEXT:    uunpklo z0.s, z0.h
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    uunpkhi z5.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    lsl z5.s, z5.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsub z2.s, z3.s, z2.s
-; SVE-NEXT:    fsub z0.s, z0.s, z1.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z2.s
+; SVE-NEXT:    fmad z3.s, p0/m, z5.s, z4.s
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfcvt z1.h, p0/m, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; SVE-NEXT:    ret
@@ -133,14 +117,12 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
 define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
 ; SVE-LABEL: fmls_nxv4bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    fneg z1.h, p0/m, z1.h
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsub z0.s, z0.s, z1.s
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -157,14 +139,12 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
 define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
 ; SVE-LABEL: fmls_nxv2bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
-; SVE-NEXT:    bfcvt z1.h, p0/m, z1.s
+; SVE-NEXT:    fneg z1.h, p0/m, z1.h
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -183,26 +163,20 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    uunpkhi z3.s, z2.h
 ; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpkhi z5.s, z0.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z6.s, z0.h
 ; SVE-NEXT:    ptrue p1.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
+; SVE-NEXT:    lsl z5.s, z5.s, #16
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmul z3.s, z4.s, z3.s
-; SVE-NEXT:    uunpklo z4.s, z0.h
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    lsl z6.s, z6.s, #16
+; SVE-NEXT:    fmad z3.s, p1/m, z4.s, z5.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z6.s
 ; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT:    uunpkhi z3.s, z0.h
-; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z3.s, z3.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z2.s, z3.s, z2.s
-; SVE-NEXT:    fadd z1.s, z4.s, z1.s
-; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
 ; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
 ; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
 ; SVE-NEXT:    mov z0.h, p0/m, z1.h
@@ -223,12 +197,9 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z3.s, z0.s, #16
 ; SVE-NEXT:    ptrue p1.s
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    lsl z2.s, z0.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z1.s, z2.s, z1.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
 ; SVE-NEXT:    ret
 ;
@@ -247,12 +218,9 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
+; SVE-NEXT:    lsl z3.s, z0.s, #16
 ; SVE-NEXT:    ptrue p1.d
-; SVE-NEXT:    fmul z1.s, p1/m, z1.s, z2.s
-; SVE-NEXT:    lsl z2.s, z0.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fadd z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
 ; SVE-NEXT:    ret
 ;
@@ -269,28 +237,24 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
 define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
 ; SVE-LABEL: fmls_sel_nxv8bf16:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p1.h
 ; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z1.h
+; SVE-NEXT:    uunpkhi z4.s, z0.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
-; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z6.s, z0.h
+; SVE-NEXT:    fneg z1.h, p1/m, z1.h
 ; SVE-NEXT:    ptrue p1.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
 ; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z6.s, z6.s, #16
+; SVE-NEXT:    uunpkhi z5.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    lsl z5.s, z5.s, #16
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmul z3.s, z4.s, z3.s
-; SVE-NEXT:    uunpklo z4.s, z0.h
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
+; SVE-NEXT:    fmad z3.s, p1/m, z5.s, z4.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z6.s
 ; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT:    uunpkhi z3.s, z0.h
-; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z3.s, z3.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsub z2.s, z3.s, z2.s
-; SVE-NEXT:    fsub z1.s, z4.s, z1.s
-; SVE-NEXT:    bfcvt z2.h, p1/m, z2.s
 ; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
 ; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
 ; SVE-NEXT:    mov z0.h, p0/m, z1.h
@@ -309,14 +273,12 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
 ; SVE-LABEL: fmls_sel_nxv4bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    ptrue p1.s
-; SVE-NEXT:    fmul z1.s, z1.s, z2.s
-; SVE-NEXT:    lsl z2.s, z0.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z0.s, #16
+; SVE-NEXT:    fneg z1.h, p1/m, z1.h
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsub z1.s, z2.s, z1.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
 ; SVE-NEXT:    ret
 ;
@@ -333,14 +295,12 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
 define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
 ; SVE-LABEL: fmls_sel_nxv2bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    ptrue p1.d
-; SVE-NEXT:    fmul z1.s, p1/m, z1.s, z2.s
-; SVE-NEXT:    lsl z2.s, z0.s, #16
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT:    lsl z2.s, z2.s, #16
+; SVE-NEXT:    lsl z3.s, z0.s, #16
+; SVE-NEXT:    fneg z1.h, p1/m, z1.h
 ; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fsubr z1.s, p1/m, z1.s, z2.s
+; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
 ; SVE-NEXT:    ret
 ;

>From 9886945da72e51cc8a2b6203088e8cd76e8bea94 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 15:29:29 +0000
Subject: [PATCH 3/4] [LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma
 operations.

NOTE: As far as I can see, LLVM has no support for FEAT_AFP in terms of
feature detection or ACLE builtins, so I believe the compiler can (and
does) work under the assumption that the feature is not enabled.
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  4 +
 llvm/test/CodeGen/AArch64/sve-bf16-arith.ll   | 28 +++---
 .../test/CodeGen/AArch64/sve-bf16-combines.ll | 95 +++++++------------
 3 files changed, 50 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..ce6de5c780cf3 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2578,6 +2578,10 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
   defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
   defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
 
+  def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
+                                                                  (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
+            (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
+
   defm BFCVT_ZPmZ   : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
   defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>;
 } // End HasBF16, HasSVE_or_SME
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
index 0580f5e0b019a..582e8456c05b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
@@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x
 define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
 ; NOB16B16-LABEL: fmla_nxv4bf16:
 ; NOB16B16:       // %bb.0:
-; NOB16B16-NEXT:    lsl z1.s, z1.s, #16
-; NOB16B16-NEXT:    lsl z0.s, z0.s, #16
 ; NOB16B16-NEXT:    lsl z2.s, z2.s, #16
 ; NOB16B16-NEXT:    ptrue p0.s
-; NOB16B16-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT:    bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT:    bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT:    bfcvt z0.h, p0/m, z2.s
 ; NOB16B16-NEXT:    ret
 ;
 ; B16B16-LABEL: fmla_nxv4bf16:
@@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x
 define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
 ; NOB16B16-LABEL: fmla_nxv8bf16:
 ; NOB16B16:       // %bb.0:
-; NOB16B16-NEXT:    uunpkhi z3.s, z1.h
-; NOB16B16-NEXT:    uunpkhi z4.s, z0.h
-; NOB16B16-NEXT:    uunpkhi z5.s, z2.h
+; NOB16B16-NEXT:    uunpkhi z3.s, z2.h
+; NOB16B16-NEXT:    uunpklo z2.s, z2.h
+; NOB16B16-NEXT:    uunpkhi z4.s, z1.h
+; NOB16B16-NEXT:    uunpkhi z5.s, z0.h
 ; NOB16B16-NEXT:    uunpklo z1.s, z1.h
 ; NOB16B16-NEXT:    uunpklo z0.s, z0.h
-; NOB16B16-NEXT:    uunpklo z2.s, z2.h
 ; NOB16B16-NEXT:    ptrue p0.s
 ; NOB16B16-NEXT:    lsl z3.s, z3.s, #16
-; NOB16B16-NEXT:    lsl z4.s, z4.s, #16
-; NOB16B16-NEXT:    lsl z5.s, z5.s, #16
-; NOB16B16-NEXT:    lsl z1.s, z1.s, #16
-; NOB16B16-NEXT:    lsl z0.s, z0.s, #16
 ; NOB16B16-NEXT:    lsl z2.s, z2.s, #16
-; NOB16B16-NEXT:    fmad z3.s, p0/m, z4.s, z5.s
-; NOB16B16-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT:    bfcvt z1.h, p0/m, z3.s
-; NOB16B16-NEXT:    bfcvt z0.h, p0/m, z0.s
-; NOB16B16-NEXT:    uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT:    bfmlalb z3.s, z5.h, z4.h
+; NOB16B16-NEXT:    bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT:    bfcvt z0.h, p0/m, z3.s
+; NOB16B16-NEXT:    bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; NOB16B16-NEXT:    ret
 ;
 ; B16B16-LABEL: fmla_nxv8bf16:
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 230bd9cf5420f..16e8feb0dc5bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -7,21 +7,17 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
 ; SVE-LABEL: fmla_nxv8bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z1.h
-; SVE-NEXT:    uunpkhi z5.s, z0.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpkhi z4.s, z2.h
+; SVE-NEXT:    uunpkhi z5.s, z1.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    uunpklo z0.s, z0.h
 ; SVE-NEXT:    ptrue p0.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
-; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    lsl z5.s, z5.s, #16
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    fmad z3.s, p0/m, z4.s, z5.s
-; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfmlalb z3.s, z5.h, z4.h
+; SVE-NEXT:    bfmlalb z0.s, z1.h, z2.h
 ; SVE-NEXT:    bfcvt z1.h, p0/m, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
@@ -40,11 +36,9 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
 define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
 ; SVE-LABEL: fmla_nxv4bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
 ; SVE-NEXT:    ptrue p0.s
-; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfmlalb z0.s, z1.h, z2.h
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -83,22 +77,18 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
 ; SVE-LABEL: fmls_nxv8bf16:
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ptrue p0.h
-; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z0.h
-; SVE-NEXT:    uunpklo z2.s, z2.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
 ; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpkhi z5.s, z2.h
+; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    fneg z1.h, p0/m, z1.h
 ; SVE-NEXT:    ptrue p0.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
-; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
-; SVE-NEXT:    uunpkhi z5.s, z1.h
+; SVE-NEXT:    uunpkhi z4.s, z1.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    lsl z5.s, z5.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmad z3.s, p0/m, z5.s, z4.s
-; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfmlalb z3.s, z4.h, z5.h
+; SVE-NEXT:    bfmlalb z0.s, z1.h, z2.h
 ; SVE-NEXT:    bfcvt z1.h, p0/m, z3.s
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    uzp1 z0.h, z0.h, z1.h
@@ -118,11 +108,9 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
 ; SVE-LABEL: fmls_nxv4bf16:
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ptrue p0.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z0.s, z0.s, #16
 ; SVE-NEXT:    fneg z1.h, p0/m, z1.h
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT:    bfmlalb z0.s, z1.h, z2.h
 ; SVE-NEXT:    bfcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    ret
 ;
@@ -161,24 +149,20 @@ define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
 define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
 ; SVE-LABEL: fmla_sel_nxv8bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z1.h
-; SVE-NEXT:    uunpkhi z5.s, z0.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    uunpkhi z5.s, z2.h
+; SVE-NEXT:    uunpkhi z6.s, z1.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    uunpklo z6.s, z0.h
 ; SVE-NEXT:    ptrue p1.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    lsl z5.s, z5.s, #16
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    lsl z6.s, z6.s, #16
-; SVE-NEXT:    fmad z3.s, p1/m, z4.s, z5.s
-; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z6.s
-; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    bfmlalb z3.s, z6.h, z5.h
+; SVE-NEXT:    bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT:    bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT:    uzp1 z1.h, z2.h, z1.h
 ; SVE-NEXT:    mov z0.h, p0/m, z1.h
 ; SVE-NEXT:    ret
 ;
@@ -195,12 +179,9 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
 ; SVE-LABEL: fmla_sel_nxv4bf16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
 ; SVE-NEXT:    lsl z3.s, z0.s, #16
-; SVE-NEXT:    ptrue p1.s
-; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
-; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT:    bfcvt z0.h, p0/m, z3.s
 ; SVE-NEXT:    ret
 ;
 ; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
@@ -238,25 +219,21 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
 ; SVE-LABEL: fmls_sel_nxv8bf16:
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ptrue p1.h
-; SVE-NEXT:    uunpkhi z3.s, z2.h
-; SVE-NEXT:    uunpkhi z4.s, z0.h
+; SVE-NEXT:    uunpkhi z3.s, z0.h
+; SVE-NEXT:    uunpklo z4.s, z0.h
+; SVE-NEXT:    uunpkhi z6.s, z2.h
 ; SVE-NEXT:    uunpklo z2.s, z2.h
-; SVE-NEXT:    uunpklo z6.s, z0.h
 ; SVE-NEXT:    fneg z1.h, p1/m, z1.h
 ; SVE-NEXT:    ptrue p1.s
 ; SVE-NEXT:    lsl z3.s, z3.s, #16
 ; SVE-NEXT:    lsl z4.s, z4.s, #16
-; SVE-NEXT:    lsl z2.s, z2.s, #16
-; SVE-NEXT:    lsl z6.s, z6.s, #16
 ; SVE-NEXT:    uunpkhi z5.s, z1.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
-; SVE-NEXT:    lsl z5.s, z5.s, #16
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmad z3.s, p1/m, z5.s, z4.s
-; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z6.s
-; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT:    bfmlalb z3.s, z5.h, z6.h
+; SVE-NEXT:    bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT:    bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT:    bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT:    uzp1 z1.h, z2.h, z1.h
 ; SVE-NEXT:    mov z0.h, p0/m, z1.h
 ; SVE-NEXT:    ret
 ;
@@ -274,12 +251,10 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
 ; SVE-LABEL: fmls_sel_nxv4bf16:
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ptrue p1.s
-; SVE-NEXT:    lsl z2.s, z2.s, #16
 ; SVE-NEXT:    lsl z3.s, z0.s, #16
 ; SVE-NEXT:    fneg z1.h, p1/m, z1.h
-; SVE-NEXT:    lsl z1.s, z1.s, #16
-; SVE-NEXT:    fmad z1.s, p1/m, z2.s, z3.s
-; SVE-NEXT:    bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT:    bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT:    bfcvt z0.h, p0/m, z3.s
 ; SVE-NEXT:    ret
 ;
 ; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:

>From 8112f30b5dde935d4a3a58666023609f49a464b0 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Mon, 10 Nov 2025 18:36:33 +0000
Subject: [PATCH 4/4] Reformat AArch64SVEInstrInfo.td changes.

---
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index ce6de5c780cf3..ece012a035bfc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2578,8 +2578,9 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
   defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
   defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
 
-  def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
-                                                                  (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
+  def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc,
+                (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
+                (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
             (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
 
   defm BFCVT_ZPmZ   : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;