[Mlir-commits] [clang] [llvm] [mlir] [AArch64][llvm][clang] Remove `int_aarch64_sve_bfmmla` and reuse existing def (NFC) (PR #193970)

Fri Apr 24 07:56:46 PDT 2026

https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/193970

>From 72315270e4401fa2b63a2787df5f267a8fb031f7 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 24 Apr 2026 14:05:25 +0100
Subject: [PATCH 1/2] [AArch64][llvm][clang] Remove `int_aarch64_sve_bfmmla`
 and reuse existing def (NFC)

Remove the dedicated (superfluous) `int_aarch64_sve_bfmmla` def and
changed `svbfmmla` to use the existing shared fmmla intrinsic instead.

No functional change.
---
 clang/include/clang/Basic/arm_sve.td                      | 2 +-
 .../test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfmmla.c | 4 ++--
 llvm/include/llvm/IR/IntrinsicsAArch64.td                 | 3 ---
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td            | 2 +-
 llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll        | 4 ++--
 .../MemorySanitizer/AArch64/sve-intrinsics-bfloat.ll      | 8 ++++----
 6 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 2d20ce6154bcb..10d24d1d8cfe1 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -256,7 +256,7 @@ let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
 }
 
 let SVETargetGuard = "bf16", SMETargetGuard = InvalidMode in {
-  def SVBFMMLA       : SInst<"svbfmmla[_{0}]",       "MMdd",  "b", MergeNone, "aarch64_sve_bfmmla",       [IsOverloadNone]>;
+  def SVBFMMLA       : SInst<"svbfmmla[_{0}]",       "MMdd",  "b", MergeNone, "aarch64_sve_fmmla",        [IsOverloadFirstandLast]>;
 }
 
 let SVETargetGuard = "bf16", SMETargetGuard = "bf16" in {
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfmmla.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfmmla.c
index 7e93514c51ec7..9c166305ee67b 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfmmla.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfmmla.c
@@ -17,12 +17,12 @@
 
 // CHECK-LABEL: @test_bfmmla_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z15test_bfmmla_f32u13__SVFloat32_tu14__SVBfloat16_tS0_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfmmla_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 578f54561910b..9af79b77c88de 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2843,8 +2843,6 @@ def int_aarch64_sve_bfdot   : SVE_4Vec_BF16;
 def int_aarch64_sve_bfmlalb : SVE_4Vec_BF16;
 def int_aarch64_sve_bfmlalt : SVE_4Vec_BF16;
 
-def int_aarch64_sve_bfmmla  : SVE_4Vec_BF16;
-
 def int_aarch64_sve_bfdot_lane_v2   : SVE_4Vec_BF16_Indexed;
 def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed;
 def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed;
@@ -4290,4 +4288,3 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
 }
-
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 8ab4e7d33d41c..8b099457ab812 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2619,7 +2619,7 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
 } // End HasBF16, HasSVE_or_SME
 
 let Predicates = [HasBF16, HasSVE] in {
-  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b011, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>;
+  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b011, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla, nxv4f32, nxv8bf16>;
 } // End HasBF16, HasSVE
 
 let Predicates = [HasBF16, HasSVE_or_SME] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
index 7f352041ec587..93e39bc61680e 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
@@ -229,7 +229,7 @@ define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x b
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmmla z0.s, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
   ret <vscale x 4 x float> %out
 }
 
@@ -283,7 +283,7 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vs
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat>, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sve-intrinsics-bfloat.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sve-intrinsics-bfloat.ll
index a75e3dca6b53e..8d2cd15d01879 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sve-intrinsics-bfloat.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sve-intrinsics-bfloat.ll
@@ -10,7 +10,7 @@
 ; - llvm.aarch64.sve.bfmlalb.lane.v2
 ; - llvm.aarch64.sve.bfmlalt
 ; - llvm.aarch64.sve.bfmlalt.lane.v2
-; - llvm.aarch64.sve.bfmmla
+; - llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16
 ; - llvm.aarch64.sve.convert.from.svbool.nxv4i1
 ; - llvm.aarch64.sve.convert.to.svbool.nxv8i1
 ; - llvm.aarch64.sve.fcvt.bf16f32.v2
@@ -304,11 +304,11 @@ define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x b
 ; CHECK-LABEL: define <vscale x 4 x float> @bfmmla_f32(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x bfloat> [[B:%.*]], <vscale x 8 x bfloat> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> [[A]], <vscale x 8 x bfloat> [[B]], <vscale x 8 x bfloat> [[C]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> [[A]], <vscale x 8 x bfloat> [[B]], <vscale x 8 x bfloat> [[C]])
 ; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
   ret <vscale x 4 x float> %out
 }
 
@@ -372,7 +372,7 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vs
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat>, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)

>From d57fad59e23518b6b45f9f17fabf5a93b8746745 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 24 Apr 2026 15:44:16 +0100
Subject: [PATCH 2/2] fixup! Fix MLIR and add autoupgrade code and test

---
 llvm/lib/IR/AutoUpgrade.cpp                         |  6 ++++++
 llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll | 10 ++++++++++
 mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td       |  8 ++++----
 mlir/test/Target/LLVMIR/arm-sve.mlir                |  2 +-
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2728897372009..143623c72246b 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1004,6 +1004,12 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
     if (Name.consume_front("sve.")) {
       // 'aarch64.sve.*'.
       if (Name.consume_front("bf")) {
+        if (Name == "mmla") {
+          Type *Tys[] = {F->getReturnType(), std::next(F->arg_begin())->getType()};
+          NewFn = Intrinsic::getOrInsertDeclaration(
+              F->getParent(), Intrinsic::aarch64_sve_fmmla, Tys);
+          return true;
+        }
         if (Name.consume_back(".lane")) {
           // 'aarch64.sve.bf*.lane'.
           Intrinsic::ID ID =
diff --git a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
index 87cf42ec6827a..832c2c4e072f4 100644
--- a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
+++ b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
@@ -198,6 +198,15 @@ define <vscale x 4 x float> @bfmlalt_lane(<vscale x 4 x float> %a, <vscale x 8 x
   ret <vscale x 4 x float> %out
 }
 
+; bfmmla
+define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
+; CHECK-LABEL: @bfmmla_f32
+; CHECK:       %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+; CHECK-NEXT:  ret <vscale x 4 x float> %out
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2(<vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -209,3 +218,4 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
index 38cfb3474f2f1..9ff415a60a068 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
+++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
@@ -294,10 +294,10 @@ def UsmmlaOp : ArmSVE_Op<"usmmla", [Pure,
     "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)";
 }
 
-def BfmmlaOp : ArmSVE_IntrOp<"bfmmla", [Pure,
-                                        AllTypesMatch<["src1", "src2"]>,
-                                        AllTypesMatch<["acc", "res"]>,
-                                        ]> {
+def BfmmlaOp : LLVM_IntrOpBase<ArmSVE_Dialect, "intr.bfmmla", "aarch64_sve_fmmla", [0], [1],
+                               [Pure,
+                                AllTypesMatch<["src1", "src2"]>,
+                                AllTypesMatch<["acc", "res"]>], 1> {
   let summary = "BFloat16 matrix multiply-accumulate";
   let description = [{
     BFMMLA: BFloat16 matrix multiply-accumulate into 2×2 matrices";
diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir
index 737145c74e331..dbe82dd3be181 100644
--- a/mlir/test/Target/LLVMIR/arm-sve.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sve.mlir
@@ -65,7 +65,7 @@ llvm.func @arm_sve_bfmmla(%arg0: vector<[8]xbf16>,
                           %arg1: vector<[8]xbf16>,
                           %arg2: vector<[4]xf32>)
                           -> vector<[4]xf32> {
-  // CHECK: call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>
+  // CHECK: call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8bf16(<vscale x 4 x float>
   %0 = "arm_sve.intr.bfmmla"(%arg2, %arg0, %arg1) :
     (vector<[4]xf32>, vector<[8]xbf16>, vector<[8]xbf16>)
         -> vector<[4]xf32>