[llvm] [LMI] Support non-power-of-2 types for the matmul remainder (PR #163987)

Adam Nemet via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 17 11:03:48 PDT 2025


https://github.com/anemet updated https://github.com/llvm/llvm-project/pull/163987

>From a1b2a554b22620c45c6f56ba416905a572595c38 Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet at apple.com>
Date: Fri, 17 Oct 2025 08:56:53 -0700
Subject: [PATCH 1/3] [LMI] Support non-power-of-2 types for the matmul
 remainder

In the inner loop of matmul, instead of continuously halving the HW
vector register width, I just use the remainder vector directly if
it's legal.

We don't have in-tree targets that have this so I opted for adding a
hidden flag to simulate this for testing purposes:
-matrix-split-matmul-remainder=0

The tests are the vectorization-friendly 3x3x1 matrix-vector and
1x3x3 vector-matrix multiplies for CM, RM respectively.
---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 36 +++++--
 .../multiply-remainder-rm.ll                  | 95 +++++++++++++++++++
 .../multiply-remainder.ll                     | 95 +++++++++++++++++++
 3 files changed, 219 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 7cae94ebb4ba1..242cbee8b3c9d 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -97,6 +97,11 @@ static cl::opt<MatrixLayoutTy> MatrixLayout(
 static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
                                             cl::init(false));
 
+static cl::opt<bool> SplitMatmulRemainder(
+    "matrix-split-matmul-remainder", cl::Hidden,
+    cl::desc("Split remainder vector in the inner loop of matmul"),
+    cl::init(true));
+
 /// Helper function to either return Scope, if it is a subprogram or the
 /// attached subprogram for a local scope.
 static DISubprogram *getSubprogram(DIScope *Scope) {
@@ -1719,6 +1724,26 @@ class LowerMatrixIntrinsics {
     ToRemove.push_back(MatMul);
   }
 
+  /// Given \p Remainder iterations of the the matmul inner loop,
+  /// potentially lower \p Blocksize that is used for the underlying
+  /// vector.
+  unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) {
+    if (BlockSize <= Remainder)
+      return BlockSize;
+
+    // If the remainder is also a legal type just use it.
+    if (TTI.isTypeLegal(FixedVectorType::get(EltType, Remainder)) ||
+        !SplitMatmulRemainder)
+      return Remainder;
+
+    // Gradually lower the vectorization factor to cover the
+    // remainder.
+    do {
+      BlockSize /= 2;
+    } while (BlockSize > Remainder);
+    return BlockSize;
+  }
+
   /// Compute \p Result += \p A * \p B for input matrices with left-associating
   /// addition.
   ///
@@ -1756,10 +1781,8 @@ class LowerMatrixIntrinsics {
         bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
 
         for (unsigned I = 0; I < R; I += BlockSize) {
-          // Gradually lower the vectorization factor to cover the remainder.
-          while (I + BlockSize > R)
-            BlockSize /= 2;
-
+          // Lower block size to make sure we stay within bounds.
+          BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType());
           Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
                                : nullptr;
           for (unsigned K = 0; K < M; ++K) {
@@ -1784,9 +1807,8 @@ class LowerMatrixIntrinsics {
         unsigned BlockSize = VF;
         bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
         for (unsigned J = 0; J < C; J += BlockSize) {
-          // Gradually lower the vectorization factor to cover the remainder.
-          while (J + BlockSize > C)
-            BlockSize /= 2;
+          // Lower the vectorization factor to cover the remainder.
+          BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType());
 
           Value *Sum = nullptr;
           for (unsigned K = 0; K < M; ++K) {
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
new file mode 100644
index 0000000000000..f71aa5305f679
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics'                                  -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder=0 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM_NO_SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; RM_SPLIT_REMAINDER-LABEL: define void @matmul(
+; RM_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; RM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; RM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]]
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; RM_SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; RM_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
+; RM_SPLIT_REMAINDER-NEXT:    ret void
+;
+; RM_NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; RM_NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; RM_NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
+; RM_NO_SPLIT_REMAINDER-NEXT:    ret void
+;
+  %a_load = load <3 x float>, ptr %a, align 4
+  %b_load = load <9 x float>, ptr %b, align 4
+  %matmul = tail call <3 x float> @llvm.matrix.multiply.v3f32.v9f32.v3f32(<3 x float> %a_load, <9 x float> %b_load, i32 1, i32 3, i32 3)
+  store <3 x float> %matmul, ptr %c, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
new file mode 100644
index 0000000000000..b60c3858c31c6
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics'                                  -S < %s | FileCheck --check-prefix=CM_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder=0 -S < %s | FileCheck --check-prefix=CM_NO_SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; CM_SPLIT_REMAINDER-LABEL: define void @matmul(
+; CM_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; CM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; CM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]]
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CM_SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; CM_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
+; CM_SPLIT_REMAINDER-NEXT:    ret void
+;
+; CM_NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; CM_NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; CM_NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
+; CM_NO_SPLIT_REMAINDER-NEXT:    ret void
+;
+  %a_load = load <9 x float>, ptr %a, align 4
+  %b_load = load <3 x float>, ptr %b, align 4
+  %matmul = tail call <3 x float> @llvm.matrix.multiply.v9f32.v3f32.v3f32(<9 x float> %a_load, <3 x float> %b_load, i32 3, i32 3, i32 1)
+  store <3 x float> %matmul, ptr %c, align 4
+  ret void
+}

>From 0b375ac91f530ce4cd4ab43716f1f2666cfa01f9 Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet at apple.com>
Date: Fri, 17 Oct 2025 10:45:44 -0700
Subject: [PATCH 2/3] [LMI] Change matrix-split-matmul-remainder into a
 treshold

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  18 +-
 .../multiply-remainder-rm.ll                  | 159 +++++++++---------
 .../multiply-remainder.ll                     | 159 +++++++++---------
 3 files changed, 172 insertions(+), 164 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 242cbee8b3c9d..93dada4824f19 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -97,10 +97,11 @@ static cl::opt<MatrixLayoutTy> MatrixLayout(
 static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
                                             cl::init(false));
 
-static cl::opt<bool> SplitMatmulRemainder(
-    "matrix-split-matmul-remainder", cl::Hidden,
-    cl::desc("Split remainder vector in the inner loop of matmul"),
-    cl::init(true));
+static cl::opt<unsigned> SplitMatmulRemainderOverThreshold(
+    "matrix-split-matmul-remainder-over-threshold", cl::Hidden,
+    cl::desc("Illegal remainder vectors over this size in bits should be split "
+             "in the inner loop of matmul"),
+    cl::init(0));
 
 /// Helper function to either return Scope, if it is a subprogram or the
 /// attached subprogram for a local scope.
@@ -1732,8 +1733,13 @@ class LowerMatrixIntrinsics {
       return BlockSize;
 
     // If the remainder is also a legal type just use it.
-    if (TTI.isTypeLegal(FixedVectorType::get(EltType, Remainder)) ||
-        !SplitMatmulRemainder)
+    FixedVectorType *VecTy = FixedVectorType::get(EltType, Remainder);
+    if (TTI.isTypeLegal(VecTy))
+      return Remainder;
+
+    // Similarly, if the vector is small enough that we don't want
+    // to split further.
+    if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold)
       return Remainder;
 
     // Gradually lower the vectorization factor to cover the
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
index f71aa5305f679..4ec58988bca7b 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='lower-matrix-intrinsics'                                  -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM_SPLIT_REMAINDER %s
-; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder=0 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM_NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics'                                                  -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
 
 ; REQUIRES: aarch64-registered-target
 
@@ -8,84 +9,84 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
 target triple = "aarch64-apple-ios"
 
 define void @matmul(ptr %a, ptr %b, ptr %c) {
-; RM_SPLIT_REMAINDER-LABEL: define void @matmul(
-; RM_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
-; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
-; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
-; RM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
-; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
-; RM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
-; RM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]]
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]]
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
-; RM_SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
-; RM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; RM_SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
-; RM_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
-; RM_SPLIT_REMAINDER-NEXT:    ret void
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]]
+; SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]]
+; SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT:    ret void
 ;
-; RM_NO_SPLIT_REMAINDER-LABEL: define void @matmul(
-; RM_NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]]
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; RM_NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
-; RM_NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
-; RM_NO_SPLIT_REMAINDER-NEXT:    ret void
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    ret void
 ;
   %a_load = load <3 x float>, ptr %a, align 4
   %b_load = load <9 x float>, ptr %b, align 4
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
index b60c3858c31c6..fbc2cbc164daf 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='lower-matrix-intrinsics'                                  -S < %s | FileCheck --check-prefix=CM_SPLIT_REMAINDER %s
-; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder=0 -S < %s | FileCheck --check-prefix=CM_NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics'                                                  -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
 
 ; REQUIRES: aarch64-registered-target
 
@@ -8,84 +9,84 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
 target triple = "aarch64-apple-ios"
 
 define void @matmul(ptr %a, ptr %b, ptr %c) {
-; CM_SPLIT_REMAINDER-LABEL: define void @matmul(
-; CM_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
-; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
-; CM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
-; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
-; CM_SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
-; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
-; CM_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]]
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]]
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
-; CM_SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
-; CM_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CM_SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
-; CM_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
-; CM_SPLIT_REMAINDER-NEXT:    ret void
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT:    [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]]
+; SPLIT_REMAINDER-NEXT:    [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT:    [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]]
+; SPLIT_REMAINDER-NEXT:    [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT:    [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT:    [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT:    ret void
 ;
-; CM_NO_SPLIT_REMAINDER-LABEL: define void @matmul(
-; CM_NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]]
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CM_NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
-; CM_NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
-; CM_NO_SPLIT_REMAINDER-NEXT:    ret void
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT:    [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT:    store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT:    ret void
 ;
   %a_load = load <9 x float>, ptr %a, align 4
   %b_load = load <3 x float>, ptr %b, align 4

>From d95318b3d62be384dc8b430a3165d9f02bc7a2c4 Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet at apple.com>
Date: Fri, 17 Oct 2025 11:03:40 -0700
Subject: [PATCH 3/3] Use auto

Co-authored-by: Jon Roelofs <jroelofs at gmail.com>
---
 llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 93dada4824f19..7bd5000c01def 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1733,7 +1733,7 @@ class LowerMatrixIntrinsics {
       return BlockSize;
 
     // If the remainder is also a legal type just use it.
-    FixedVectorType *VecTy = FixedVectorType::get(EltType, Remainder);
+    auto *VecTy = FixedVectorType::get(EltType, Remainder);
     if (TTI.isTypeLegal(VecTy))
       return Remainder;
 



More information about the llvm-commits mailing list