[llvm] [Matrix][IR] Cap stride bitwidth at 64 (PR #163729)

Nathan Corbyn via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 17 02:08:01 PDT 2025


https://github.com/cofibrant updated https://github.com/llvm/llvm-project/pull/163729

>From a455a6c9bc044dabad26d3feaf2b86ee19b8015a Mon Sep 17 00:00:00 2001
From: Nathan Corbyn <n_corbyn at apple.com>
Date: Thu, 16 Oct 2025 13:06:16 +0100
Subject: [PATCH] [Matrix][IR] Cap stride bitwidth at 64

---
 llvm/docs/LangRef.rst                         |  25 ++-
 llvm/lib/IR/Verifier.cpp                      |   5 +-
 .../data-layout-multiply-fused.ll             | 174 ------------------
 .../LowerMatrixIntrinsics/data-layout.ll      | 162 +---------------
 llvm/test/Verifier/matrix-intrinsics.ll       |  23 ++-
 5 files changed, 43 insertions(+), 346 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0c54f57f5a113..5b4b53d5ea053 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21062,12 +21062,15 @@ integer element type.
 
 Syntax:
 """""""
-This is an overloaded intrinsic.
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.load``
+to load any vector type with a stride of any bitwidth up to 64.
 
 ::
 
-      declare vectorty @llvm.matrix.column.major.load.*(
+      declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(
           ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+      declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(
+          ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
 
 Overview:
 """""""""
@@ -21086,9 +21089,9 @@ Arguments:
 
 The first argument ``%Ptr`` is a pointer type to the returned vector type, and
 corresponds to the start address to load from. The second argument ``%Stride``
-is a positive, constant integer with ``%Stride >= <Rows>``. ``%Stride`` is used
-to compute the column memory addresses. I.e., for a column ``C``, its start
-memory addresses is calculated with ``%Ptr + C * %Stride``. The third Argument
+is a positive integer for which ``%Stride >= <Rows>``. ``%Stride`` is used to
+compute the column memory addresses. I.e., for a column ``C``, its start memory
+addresses is calculated with ``%Ptr + C * %Stride``. The third Argument
 ``<IsVolatile>`` is a boolean value.  The fourth and fifth arguments,
 ``<Rows>`` and ``<Cols>``, correspond to the number of rows and columns,
 respectively, and must be positive, constant integers. The returned vector must
@@ -21103,11 +21106,17 @@ The :ref:`align <attr_align>` parameter attribute can be provided for the
 
 Syntax:
 """""""
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.store``
+to store any vector type with a stride of any bitwidth up to 64.
 
 ::
 
-      declare void @llvm.matrix.column.major.store.*(
-          vectorty %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+      declare void @llvm.matrix.column.major.store.v4i32.i64(
+          <4 x i32> %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>,
+          i32 <Cols>)
+      declare void @llvm.matrix.column.major.store.v9f64.i32(
+          <9 x double> %In, ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32
+          <Rows>, i32 <Cols>)
 
 Overview:
 """""""""
@@ -21127,7 +21136,7 @@ Arguments:
 The first argument ``%In`` is a vector that corresponds to a ``<Rows> x
 <Cols>`` matrix to be stored to memory. The second argument ``%Ptr`` is a
 pointer to the vector type of ``%In``, and is the start address of the matrix
-in memory. The third argument ``%Stride`` is a positive, constant integer with
+in memory. The third argument ``%Stride`` is a positive integer for which
 ``%Stride >= <Rows>``.  ``%Stride`` is used to compute the column memory
 addresses. I.e., for a column ``C``, its start memory addresses is calculated
 with ``%Ptr + C * %Stride``.  The fourth argument ``<IsVolatile>`` is a boolean
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c79a95087dbdd..357285234ef35 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6479,9 +6479,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
               NumRows->getZExtValue() * NumColumns->getZExtValue(),
           "Result of a matrix operation does not fit in the returned vector!");
 
-    if (Stride)
+    if (Stride) {
+      Check(Stride->getBitWidth() <= 64, "Stride bitwidth cannot exceed 64!",
+            IF);
       Check(Stride->getZExtValue() >= NumRows->getZExtValue(),
             "Stride must be greater or equal than the number of rows!", IF);
+    }
 
     break;
   }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index d281905e6f973..abd1d96937b28 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
 
@@ -10,179 +9,6 @@
 target triple = "aarch64-unknown-unknown"
 
 define void @multiply(ptr %A, ptr %B, ptr %C) {
-; PTR128-LABEL: @multiply(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
-; PTR128-NEXT:    [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
-; PTR128-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
-; PTR128-NEXT:    [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
-; PTR128-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
-; PTR128:       alias_cont:
-; PTR128-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
-; PTR128-NEXT:    [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
-; PTR128-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
-; PTR128:       copy:
-; PTR128-NEXT:    [[TMP2:%.*]] = alloca [16 x double], align 8
-; PTR128-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
-; PTR128-NEXT:    br label [[NO_ALIAS]]
-; PTR128:       no_alias:
-; PTR128-NEXT:    [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
-; PTR128-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
-; PTR128-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
-; PTR128-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
-; PTR128-NEXT:    [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
-; PTR128-NEXT:    br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
-; PTR128:       alias_cont1:
-; PTR128-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
-; PTR128-NEXT:    [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
-; PTR128-NEXT:    br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
-; PTR128:       copy2:
-; PTR128-NEXT:    [[TMP6:%.*]] = alloca [16 x double], align 8
-; PTR128-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
-; PTR128-NEXT:    br label [[NO_ALIAS3]]
-; PTR128:       no_alias3:
-; PTR128-NEXT:    [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
-; PTR128-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
-; PTR128-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
-; PTR128-NEXT:    [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; PTR128-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
-; PTR128-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
-; PTR128-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
-; PTR128-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
-; PTR128-NEXT:    [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
-; PTR128-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
-; PTR128-NEXT:    [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
-; PTR128-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
-; PTR128-NEXT:    [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
-; PTR128-NEXT:    [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
-; PTR128-NEXT:    [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
-; PTR128-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
-; PTR128-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
-; PTR128-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
-; PTR128-NEXT:    store <2 x double> [[TMP15]], ptr [[C]], align 8
-; PTR128-NEXT:    [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
-; PTR128-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
-; PTR128-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
-; PTR128-NEXT:    [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
-; PTR128-NEXT:    [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
-; PTR128-NEXT:    [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
-; PTR128-NEXT:    [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
-; PTR128-NEXT:    [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
-; PTR128-NEXT:    [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
-; PTR128-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
-; PTR128-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
-; PTR128-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
-; PTR128-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
-; PTR128-NEXT:    [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
-; PTR128-NEXT:    [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
-; PTR128-NEXT:    [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
-; PTR128-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
-; PTR128-NEXT:    [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
-; PTR128-NEXT:    [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
-; PTR128-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
-; PTR128-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
-; PTR128-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
-; PTR128-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
-; PTR128-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
-; PTR128-NEXT:    store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
-; PTR128-NEXT:    [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
-; PTR128-NEXT:    store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
-; PTR128-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; PTR128-NEXT:    [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
-; PTR128-NEXT:    [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
-; PTR128-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
-; PTR128-NEXT:    [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
-; PTR128-NEXT:    [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
-; PTR128-NEXT:    [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
-; PTR128-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
-; PTR128-NEXT:    [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
-; PTR128-NEXT:    [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
-; PTR128-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
-; PTR128-NEXT:    [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
-; PTR128-NEXT:    [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
-; PTR128-NEXT:    [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
-; PTR128-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
-; PTR128-NEXT:    [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
-; PTR128-NEXT:    [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
-; PTR128-NEXT:    [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
-; PTR128-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
-; PTR128-NEXT:    [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
-; PTR128-NEXT:    [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
-; PTR128-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
-; PTR128-NEXT:    store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
-; PTR128-NEXT:    [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
-; PTR128-NEXT:    store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
-; PTR128-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
-; PTR128-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
-; PTR128-NEXT:    [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
-; PTR128-NEXT:    [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
-; PTR128-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
-; PTR128-NEXT:    [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
-; PTR128-NEXT:    [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
-; PTR128-NEXT:    [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
-; PTR128-NEXT:    [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
-; PTR128-NEXT:    [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
-; PTR128-NEXT:    [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
-; PTR128-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
-; PTR128-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
-; PTR128-NEXT:    [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
-; PTR128-NEXT:    [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
-; PTR128-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
-; PTR128-NEXT:    [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
-; PTR128-NEXT:    [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
-; PTR128-NEXT:    [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
-; PTR128-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
-; PTR128-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
-; PTR128-NEXT:    [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT:    [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
-; PTR128-NEXT:    [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
-; PTR128-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
-; PTR128-NEXT:    store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
-; PTR128-NEXT:    [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
-; PTR128-NEXT:    store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
-; PTR128-NEXT:    ret void
-;
 ; PTR64-LABEL: @multiply(
 ; PTR64-NEXT:  entry:
 ; PTR64-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
index 87def6b33d6cc..3d050140666c2 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
 ; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
 ; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
 
@@ -7,128 +6,13 @@
 ; the need to emit `libc` calls), we perform strided index calculations using
 ; the same pointer bit-width as the matrix pointers, as determined by the data
 ; layout. To verify this behaviour, this test runs several strided loads and
-; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
-; the generated code extends / truncates strides accordingly. Similarly,
+; stores through the lowering pass with (32|64)-bit pointers, and verifies the
+; generated code extends / truncates strides accordingly. Similarly,
 ; `data-layout-multiply-fused.ll` adopts this approach to verify the same
 ; behaviour for index calculations emitted while lowering fused matrix
 ; multiplies.
 
-define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i128(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
-; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
-; PTR64-LABEL: @strided_load_3x3_i128(
-; PTR64-NEXT:  entry:
-; PTR64-NEXT:    [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
-; PTR64-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
-; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
-; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR64-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
-; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
-; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR64-NEXT:    [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
-; PTR64-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
-; PTR64-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR64-NEXT:    ret <9 x double> [[TMP2]]
-;
-; PTR32-LABEL: @strided_load_3x3_i128(
-; PTR32-NEXT:  entry:
-; PTR32-NEXT:    [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
-; PTR32-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
-; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
-; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR32-NEXT:    [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
-; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
-; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR32-NEXT:    [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
-; PTR32-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
-; PTR32-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR32-NEXT:    ret <9 x double> [[TMP2]]
-;
-entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3)
-  ret <9 x double> %load
-}
-
-define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
-; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR64-NEXT:  entry:
-; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
-; PTR64-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
-; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR64-NEXT:    ret <9 x double> [[TMP2]]
-;
-; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR32-NEXT:  entry:
-; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
-; PTR32-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
-; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR32-NEXT:    ret <9 x double> [[TMP2]]
-;
-entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3)
-  ret <9 x double> %load
-}
-
 define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i64(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
-; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
 ; PTR64-LABEL: @strided_load_3x3_i64(
 ; PTR64-NEXT:  entry:
 ; PTR64-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
@@ -168,18 +52,6 @@ entry:
 }
 
 define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
 ; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
 ; PTR64-NEXT:  entry:
 ; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
@@ -210,23 +82,6 @@ entry:
 }
 
 define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i32(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
-; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
-; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
 ; PTR64-LABEL: @strided_load_3x3_i32(
 ; PTR64-NEXT:  entry:
 ; PTR64-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
@@ -266,18 +121,6 @@ entry:
 }
 
 define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
-; PTR128-NEXT:  entry:
-; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT:    ret <9 x double> [[TMP2]]
-;
 ; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
 ; PTR64-NEXT:  entry:
 ; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
@@ -307,6 +150,5 @@ entry:
   ret <9 x double> %load
 }
 
-declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
 declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
 declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll
index b6d5ad9a3cc49..43d1a79f0853f 100644
--- a/llvm/test/Verifier/matrix-intrinsics.ll
+++ b/llvm/test/Verifier/matrix-intrinsics.ll
@@ -1,8 +1,7 @@
-; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not opt -S %s 2>&1 | FileCheck %s
 
 define <4 x float> @transpose(<4 x float> %m, i32 %arg) {
-; CHECK: assembly parsed, but does not verify as correct!
-; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
+; CHECK: Result of a matrix operation does not fit in the returned vector!
 ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
 ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
 ; CHECK-NEXT: immarg operand has non-immediate parameter
@@ -118,16 +117,34 @@ define void @column.major_store_stride_too_small(ptr %m, i64 %arg) {
   ret void
 }
 
+define <4 x float> @column.major_load_stride_i128(ptr %m, i32 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.load.v4f32.i128
+  %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i128(ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+  ret <4 x float> %result.1
+}
+
+define void @column.major_store_stride_i128(ptr %m, i64 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.store.v4f32.i128
+  call void @llvm.matrix.column.major.store.v4f32.i128(<4 x float> zeroinitializer, ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+  ret void
+}
+
 declare <4 x i32>   @llvm.matrix.column.major.load.v4i32.i64(ptr, i64, i1, i32, i32)
 declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0(ptr, i64, i1, i32, i32)
 declare <4 x float> @llvm.matrix.column.major.load.v4f32.i64(ptr, i64, i1, i32, i32)
 declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr, i64, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i8(ptr, i8, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i128(ptr, i128, i1, i32, i32)
 
 declare void @llvm.matrix.column.major.store.v4f32.i64(<4 x float>, ptr, i64, i1, i32, i32)
 declare void @llvm.matrix.column.major.store.v6f32.i64(<6 x float>, ptr, i64, i1, i32, i32)
 declare void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32>, ptr, i64, i1, i32, i32)
 declare void @llvm.matrix.column.major.store.v4f32.p0(<4 x float>, ptr, i64, i1, i32, i32)
 declare void @llvm.matrix.column.major.store.v4p0.i64(<4 x ptr>, ptr, i64, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i8(<4 x ptr>, ptr, i8, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i128(<4 x ptr>, ptr, i128, i1, i32, i32)
 
 declare <4 x i32>   @llvm.matrix.transpose.v4i32.v4f32(<4 x float>, i32, i32)
 declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32)



More information about the llvm-commits mailing list