[llvm] [Matrix] Add a Remark when matrices get flattened (PR #142078)

Fri May 30 12:55:34 PDT 2025

https://github.com/jroelofs updated https://github.com/llvm/llvm-project/pull/142078

>From cd456be9267dfe63946d82bd924ebc10cc9bad19 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Thu, 29 May 2025 19:45:04 -0700
Subject: [PATCH] [Matrix] Add Remarks when matrices get flattened

This is a potential source of overhead, which we might be able to alleviate in
some cases. For example, static element extracts, or shuffles that pluck out a
specific row.
---
 llvm/include/llvm/IR/DiagnosticInfo.h         |   1 +
 llvm/lib/IR/DiagnosticInfo.cpp                |  12 ++
 .../Scalar/LowerMatrixIntrinsics.cpp          |  52 ++++-
 .../LowerMatrixIntrinsics/flatten.ll          |  38 ++++
 .../LowerMatrixIntrinsics/transpose-fold.ll   |  39 +++-
 .../LowerMatrixIntrinsics/transpose-opts.ll   | 181 +++++++++++++++++-
 6 files changed, 316 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/flatten.ll

diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index a1113134f6a34..2f5c6e62c7559 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -505,6 +505,7 @@ class DiagnosticInfoOptimizationBase : public DiagnosticInfoWithLocationBase {
 
     explicit Argument(StringRef Str = "") : Key("String"), Val(Str) {}
     Argument(StringRef Key, const Value *V);
+    Argument(StringRef Key, const Instruction &I);
     Argument(StringRef Key, const Type *T);
     Argument(StringRef Key, StringRef S);
     Argument(StringRef Key, const char *S) : Argument(Key, StringRef(S)) {};
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 4315f63cce4f8..3be708e8cb45e 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
@@ -211,6 +212,9 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
   else if (isa<Constant>(V)) {
     raw_string_ostream OS(Val);
     V->printAsOperand(OS, /*PrintType=*/false);
+  } else if (auto *I = dyn_cast<IntrinsicInst>(V)) {
+    raw_string_ostream OS(Val);
+    OS << "call " << *V->getType() << " @" << I->getCalledFunction()->getName();
   } else if (auto *I = dyn_cast<Instruction>(V)) {
     Val = I->getOpcodeName();
   } else if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
@@ -219,6 +223,14 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
   }
 }
 
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
+                                                   const Instruction &I)
+    : Key(std::string(Key)) {
+  Loc = I.getDebugLoc();
+  raw_string_ostream OS(Val);
+  OS << I;
+}
+
 DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Type *T)
     : Key(std::string(Key)) {
   raw_string_ostream OS(Val);
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 756a72e6d97bc..ae5c1195f315f 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -572,6 +572,37 @@ class LowerMatrixIntrinsics {
       SplitVecs.push_back(V);
     }
 
+    if (ORE) {
+      if (Instruction *Inst = dyn_cast<Instruction>(MatrixVal)) {
+        if (Found != Inst2ColumnMatrix.end()) {
+          ORE->emit([&]() {
+            // FIXME: SplitVecs.size() doesn't count the shuffles that
+            // embedInVector created.
+            return OptimizationRemarkAnalysis(DEBUG_TYPE, "mismatched-shape",
+                                              Inst)
+                   << "matrix reshape from "
+                   << ore::NV("FromRows", Found->second.getNumRows()) << "x"
+                   << ore::NV("FromCols", Found->second.getNumColumns())
+                   << " to " << ore::NV("ToRows", SI.NumRows) << "x"
+                   << ore::NV("ToCols", SI.NumColumns) << " using at least "
+                   << ore::NV("Shuffles", SplitVecs.size()) << " shuffles";
+          });
+        } else {
+          ORE->emit([&]() {
+            return OptimizationRemarkMissed(DEBUG_TYPE,
+                                            "unknown-shape-lowering-def", Inst)
+                   << "splitting a " << ore::NV("Rows", SI.NumRows) << "x"
+                   << ore::NV("Cols", SI.NumColumns) << " matrix "
+                   << " with " << ore::NV("Shuffles", SplitVecs.size())
+                   << " shuffles because we do not have a shape-aware lowering "
+                      "for its def: "
+                   << ore::NV("Instr", Inst) << ore::setExtraArgs()
+                   << ore::NV("Opcode", Inst->getOpcodeName());
+          });
+        }
+      }
+    }
+
     return {SplitVecs};
   }
 
@@ -1351,11 +1382,24 @@ class LowerMatrixIntrinsics {
     ToRemove.push_back(Inst);
     Value *Flattened = nullptr;
     for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
-      if (!ShapeMap.contains(U.getUser())) {
-        if (!Flattened)
-          Flattened = Matrix.embedInVector(Builder);
-        U.set(Flattened);
+      if (ShapeMap.contains(U.getUser()))
+        continue;
+
+      if (!Flattened) {
+        Flattened = Matrix.embedInVector(Builder);
+        if (ORE)
+          if (Instruction *User = dyn_cast<Instruction>(U.getUser()))
+            ORE->emit(OptimizationRemarkMissed(
+                          DEBUG_TYPE, "unknown-shape-lowering-use", User)
+                      << "flattening a " << ore::NV("Rows", Matrix.getNumRows())
+                      << "x" << ore::NV("Cols", Matrix.getNumColumns())
+                      << " matrix " << ore::NV("Source", *Inst)
+                      << " because we do not have a shape-aware lowering for "
+                         "its user:"
+                      << ore::NV("Instr", *User) << ore::setExtraArgs()
+                      << ore::NV("Opcode", User->getOpcodeName()));
       }
+      U.set(Flattened);
     }
   }
 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/flatten.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/flatten.ll
new file mode 100644
index 0000000000000..2752fddd21477
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/flatten.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+; RUN: opt -passes=lower-matrix-intrinsics -pass-remarks-missed=lower-matrix-intrinsics < %s -pass-remarks-output=%t -disable-output && FileCheck --input-file %t %s --check-prefix=REMARK
+
+define void @diag_3x3(ptr %in, ptr %out) {
+; REMARK-LABEL: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-NEXT: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        diag_3x3
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '3'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = call <9 x float> @llvm.matrix.column.major.load.v9f32.i64(ptr %{{.*}}, i64 3, i1 false, i32 3, i32 3)'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <9 x float> %{{.*}}, <9 x float> poison, <3 x i32> <i32 0, i32 4, i32 8>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; CHECK-LABEL: @diag_3x3(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[IN]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <6 x float> [[TMP1]], <6 x float> [[TMP2]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[DIAG:%.*]] = shufflevector <9 x float> [[TMP3]], <9 x float> poison, <3 x i32> <i32 0, i32 4, i32 8>
+; CHECK-NEXT:    store <3 x float> [[DIAG]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %inv = call <9 x float> @llvm.matrix.column.major.load(ptr %in, i64 3, i1 false, i32 3, i32 3)
+  %diag = shufflevector <9 x float> %inv, <9 x float> poison, <3 x i32> <i32 0, i32 4, i32 8>
+  store <3 x float> %diag, ptr %out
+  ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-fold.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-fold.ll
index ef2e224f69c5e..0c7930fb2492a 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-fold.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-fold.ll
@@ -1,9 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s -pass-remarks-output=%t | FileCheck %s && FileCheck --input-file %t --check-prefix=REMARK %s
 
 ; We can only fold matching transposes.
 
 define void @reshape(ptr %in, ptr %out) {
+; REMARK:      --- !Analysis
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-NEXT: Name:            mismatched-shape
+; REMARK-NEXT: Function:        reshape
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'matrix reshape from '
+; REMARK-NEXT:   - FromRows:        '4'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - FromCols:        '1'
+; REMARK-NEXT:   - String:          ' to '
+; REMARK-NEXT:   - ToRows:          '2'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - ToCols:          '2'
+; REMARK-NEXT:   - String:          ' using at least '
+; REMARK-NEXT:   - Shuffles:        '2'
+; REMARK-NEXT:   - String:          ' shuffles'
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Passed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-NEXT: Name:            matrix-lowered
+; REMARK-NEXT: Function:        reshape
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'Lowered with '
+; REMARK-NEXT:   - NumStores:       '8'
+; REMARK-NEXT:   - String:          ' stores, '
+; REMARK-NEXT:   - NumLoads:        '8'
+; REMARK-NEXT:   - String:          ' loads, '
+; REMARK-NEXT:   - NumComputeOps:   '8'
+; REMARK-NEXT:   - String:          ' compute ops, '
+; REMARK-NEXT:   - NumExposedTransposes: '1'
+; REMARK-NEXT:   - String:          ' exposed transposes'
+; REMARK-NEXT:   - String:           |
+; REMARK-NEXT:{{      }}
+; REMARK-NEXT:       store(
+; REMARK-NEXT:        transpose.4x1.double(load(addr %in)), 
+; REMARK-NEXT:        addr %out)
+; REMARK-NEXT: ...
 ; CHECK-LABEL: define void @reshape(
 ; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll
index 57d038e5cf947..bdf68cfb31f16 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts.ll
@@ -50,7 +50,68 @@ entry:
 }
 
 define void @multiply_ntt(ptr %A, ptr %B, ptr %C, ptr %R) {
-; REMARK:      Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_ntt
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '2'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           load
+; REMARK-NEXT:   - Opcode:          load
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_ntt
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '4'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           'call <12 x double> @llvm.matrix.multiply.v12f64.v8f64.v6f64'
+; REMARK-NEXT:   - Opcode:          call
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_ntt
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '2'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = load <6 x double>, ptr %{{.*}}, align 16'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <6 x double> %{{.*}}, <6 x double> poison, <2 x i32> <i32 4, i32 5>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_ntt
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '4'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = call <12 x double> @llvm.matrix.multiply.v12f64.v8f64.v6f64(<8 x double> %{{.*}}, <6 x double> %{{.*}}, i32 4, i32 2, i32 3)'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <12 x double> %{{.*}}, <12 x double> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Passed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
 ; REMARK-NEXT: Name:            matrix-lowered
 ; REMARK-NEXT: Function:        multiply_ntt
 ; REMARK-NEXT: Args:
@@ -438,7 +499,68 @@ entry:
 }
 
 define void @multiply_nt_t(ptr %A, ptr %B, ptr %C) {
-; REMARK:      Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_nt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '4'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           load
+; REMARK-NEXT:   - Opcode:          load
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_nt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '2'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           load
+; REMARK-NEXT:   - Opcode:          load
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_nt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '2'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = load <6 x double>, ptr %{{.*}}, align 16'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <6 x double> %{{.*}}, <6 x double> poison, <2 x i32> <i32 4, i32 5>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Missed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
+; REMARK-LABEL: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_nt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '4'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = load <12 x double>, ptr %B, align 16'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <12 x double> %{{.*}}, <12 x double> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; REMARK-NEXT: --- !Passed
+; REMARK-NEXT: Pass:            lower-matrix-intrinsics
 ; REMARK-NEXT: Name:            matrix-lowered
 ; REMARK-NEXT: Function:        multiply_nt_t
 ; REMARK-NEXT: Args:
@@ -573,6 +695,61 @@ entry:
 }
 
 define void @multiply_ntt_t(ptr %A, ptr %B, ptr %C, ptr %R) {
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_ntt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '3'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           'call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64'
+; REMARK-NEXT:   - Opcode:          call
+; REMARK-NEXT: ...
+; REMARK-LABEL: Name:            unknown-shape-lowering-def
+; REMARK-NEXT: Function:        multiply_ntt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'splitting a '
+; REMARK-NEXT:   - Rows:            '3'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - String:          ' with '
+; REMARK-NEXT:   - Shuffles:        '3'
+; REMARK-NEXT:   - String:          ' shuffles because we do not have a shape-aware lowering for its def: '
+; REMARK-NEXT:   - Instr:           load
+; REMARK-NEXT:   - Opcode:          load
+; REMARK-NEXT: ...
+; REMARK-LABEL: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_ntt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '3'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = load <9 x double>, ptr %A, align 16'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <9 x double> %{{.*}}, <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
+; REMARK-LABEL: Pass:            lower-matrix-intrinsics
+; REMARK-NEXT: Name:            unknown-shape-lowering-use
+; REMARK-NEXT: Function:        multiply_ntt_t
+; REMARK-NEXT: Args:
+; REMARK-NEXT:   - String:          'flattening a '
+; REMARK-NEXT:   - Rows:            '3'
+; REMARK-NEXT:   - String:          x
+; REMARK-NEXT:   - Cols:            '3'
+; REMARK-NEXT:   - String:          ' matrix '
+; REMARK-NEXT:   - Source:          '  %{{.*}} = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %{{.*}}, <9 x double> %{{.*}}, i32 3, i32 3, i32 3)'
+; REMARK-NEXT:   - String:          ' because we do not have a shape-aware lowering for its user:'
+; REMARK-NEXT:   - Instr:           '  %{{.*}} = shufflevector <9 x double> %{{.*}}, <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>'
+; REMARK-NEXT:   - Opcode:          shufflevector
+; REMARK-NEXT: ...
 ; REMARK:      Pass:            lower-matrix-intrinsics
 ; REMARK-NEXT: Name:            matrix-lowered
 ; REMARK-NEXT: Function:        multiply_ntt_t