[clang] Enable matrices in HLSL (PR #111415)

Mon Oct 7 11:09:13 PDT 2024

https://github.com/pow2clk updated https://github.com/llvm/llvm-project/pull/111415

>From f4751fcfe13b971de5c2071d3bbdc35e72051817 Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth at microsoft.com>
Date: Tue, 27 Aug 2024 09:53:13 -0600
Subject: [PATCH 1/2] Enable matrices in HLSL

HLSL needs matrix support. This patch allows matrices when the target
language is HLSL and defines a matrix template alias so that the
short forms of the matrix types can be defined as typedefs in the
default header. It also changes how matrices are printed under the
HLSL printing policy. This required a tweak to how existing matrices are
printed in diagnostics, which is now more consistent with other types.

These matrix types will function exactly as the clang matrix extension
dictates. Alterations to that behavior, both HLSL-specific changes and
potential expansions of the matrix extension itself, will follow.

fixes #109839
---
 .../clang/Sema/HLSLExternalSemaSource.h       |    1 +
 clang/lib/AST/ASTContext.cpp                  |    2 +-
 clang/lib/AST/TypePrinter.cpp                 |   40 +-
 clang/lib/Headers/hlsl/hlsl_basic_types.h     |  232 +++
 clang/lib/Sema/HLSLExternalSemaSource.cpp     |   73 +
 clang/lib/Sema/SemaType.cpp                   |    2 +-
 clang/test/AST/HLSL/matrix-alias.hlsl         |   49 +
 clang/test/AST/HLSL/vector-alias.hlsl         |    2 +-
 .../test/CodeGenCXX/matrix-type-operators.cpp |    8 +-
 .../BuiltinMatrix/matrix-cast-template.hlsl   |  351 ++++
 .../Types/BuiltinMatrix/matrix-cast.hlsl      |  162 ++
 .../matrix-transpose-template.hlsl            |   82 +
 .../Types/BuiltinMatrix/matrix-transpose.hlsl |   95 ++
 .../matrix-type-operators-template.hlsl       |  447 +++++
 .../BuiltinMatrix/matrix-type-operators.hlsl  | 1515 +++++++++++++++++
 .../Types/BuiltinMatrix/matrix-type.hlsl      |  217 +++
 clang/test/CodeGenHLSL/basic_types.hlsl       |   18 +
 clang/test/CodeGenHLSL/matrix-types.hlsl      |  348 ++++
 clang/test/Sema/matrix-type-operators.c       |   16 +-
 .../Types/BuiltinMatrix/matrix-cast.hlsl      |  138 ++
 .../matrix-index-operator-type.hlsl           |   27 +
 .../Types/BuiltinMatrix/matrix-transpose.hlsl |   56 +
 .../BuiltinMatrix/matrix-type-operators.hlsl  |  307 ++++
 .../Types/BuiltinMatrix/matrix-type.hlsl      |   48 +
 clang/test/SemaTemplate/matrix-type.cpp       |    2 +-
 25 files changed, 4209 insertions(+), 29 deletions(-)
 create mode 100644 clang/test/AST/HLSL/matrix-alias.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
 create mode 100644 clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl
 create mode 100644 clang/test/CodeGenHLSL/matrix-types.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl

diff --git a/clang/include/clang/Sema/HLSLExternalSemaSource.h b/clang/include/clang/Sema/HLSLExternalSemaSource.h
index 3c7495e66055dc..6f4b72045a9464 100644
--- a/clang/include/clang/Sema/HLSLExternalSemaSource.h
+++ b/clang/include/clang/Sema/HLSLExternalSemaSource.h
@@ -28,6 +28,7 @@ class HLSLExternalSemaSource : public ExternalSemaSource {
   llvm::DenseMap<CXXRecordDecl *, CompletionFunction> Completions;
 
   void defineHLSLVectorAlias();
+  void defineHLSLMatrixAlias();
   void defineTrivialHLSLTypes();
   void defineHLSLTypesWithForwardDeclarations();
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a81429ad6a2380..ed10c210ed170f 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1381,7 +1381,7 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
   if (LangOpts.OpenACC && !LangOpts.OpenMP) {
     InitBuiltinType(ArraySectionTy, BuiltinType::ArraySection);
   }
-  if (LangOpts.MatrixTypes)
+  if (LangOpts.MatrixTypes || LangOpts.HLSL)
     InitBuiltinType(IncompleteMatrixIdxTy, BuiltinType::IncompleteMatrixIdx);
 
   // Builtin types for 'id', 'Class', and 'SEL'.
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index ca75bb97c158e1..142717201557f3 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -852,34 +852,50 @@ void TypePrinter::printExtVectorAfter(const ExtVectorType *T, raw_ostream &OS) {
 
 void TypePrinter::printConstantMatrixBefore(const ConstantMatrixType *T,
                                             raw_ostream &OS) {
+  if (Policy.UseHLSLTypes)
+    OS << "matrix<";
   printBefore(T->getElementType(), OS);
-  OS << " __attribute__((matrix_type(";
-  OS << T->getNumRows() << ", " << T->getNumColumns();
-  OS << ")))";
 }
 
 void TypePrinter::printConstantMatrixAfter(const ConstantMatrixType *T,
                                            raw_ostream &OS) {
   printAfter(T->getElementType(), OS);
+  if (Policy.UseHLSLTypes) {
+    OS << ", ";
+    OS << T->getNumRows() << ", " << T->getNumColumns();
+    OS << ">";
+  } else {
+    OS << " __attribute__((matrix_type(";
+    OS << T->getNumRows() << ", " << T->getNumColumns();
+    OS << ")))";
+  }
 }
 
 void TypePrinter::printDependentSizedMatrixBefore(
     const DependentSizedMatrixType *T, raw_ostream &OS) {
+  if (Policy.UseHLSLTypes)
+    OS << "matrix<";
   printBefore(T->getElementType(), OS);
-  OS << " __attribute__((matrix_type(";
-  if (T->getRowExpr()) {
-    T->getRowExpr()->printPretty(OS, nullptr, Policy);
-  }
-  OS << ", ";
-  if (T->getColumnExpr()) {
-    T->getColumnExpr()->printPretty(OS, nullptr, Policy);
-  }
-  OS << ")))";
 }
 
 void TypePrinter::printDependentSizedMatrixAfter(
     const DependentSizedMatrixType *T, raw_ostream &OS) {
   printAfter(T->getElementType(), OS);
+  if (Policy.UseHLSLTypes)
+    OS << ", ";
+  else
+    OS << " __attribute__((matrix_type(";
+
+  if (Expr *E = T->getRowExpr())
+    E->printPretty(OS, nullptr, Policy);
+  OS << ", ";
+  if (Expr *E = T->getColumnExpr())
+    E->printPretty(OS, nullptr, Policy);
+
+  if (Policy.UseHLSLTypes)
+    OS << ">";
+  else
+    OS << ")))";
 }
 
 void
diff --git a/clang/lib/Headers/hlsl/hlsl_basic_types.h b/clang/lib/Headers/hlsl/hlsl_basic_types.h
index eff94e0d7f9500..b6eeffa2f5e362 100644
--- a/clang/lib/Headers/hlsl/hlsl_basic_types.h
+++ b/clang/lib/Headers/hlsl/hlsl_basic_types.h
@@ -115,6 +115,238 @@ typedef vector<float64_t, 2> float64_t2;
 typedef vector<float64_t, 3> float64_t3;
 typedef vector<float64_t, 4> float64_t4;
 
+#ifdef __HLSL_ENABLE_16_BIT
+typedef matrix<int16_t, 1, 1> int16_t1x1;
+typedef matrix<int16_t, 1, 2> int16_t1x2;
+typedef matrix<int16_t, 1, 3> int16_t1x3;
+typedef matrix<int16_t, 1, 4> int16_t1x4;
+typedef matrix<int16_t, 2, 1> int16_t2x1;
+typedef matrix<int16_t, 2, 2> int16_t2x2;
+typedef matrix<int16_t, 2, 3> int16_t2x3;
+typedef matrix<int16_t, 2, 4> int16_t2x4;
+typedef matrix<int16_t, 3, 1> int16_t3x1;
+typedef matrix<int16_t, 3, 2> int16_t3x2;
+typedef matrix<int16_t, 3, 3> int16_t3x3;
+typedef matrix<int16_t, 3, 4> int16_t3x4;
+typedef matrix<int16_t, 4, 1> int16_t4x1;
+typedef matrix<int16_t, 4, 2> int16_t4x2;
+typedef matrix<int16_t, 4, 3> int16_t4x3;
+typedef matrix<int16_t, 4, 4> int16_t4x4;
+typedef matrix<uint16_t, 1, 1> uint16_t1x1;
+typedef matrix<uint16_t, 1, 2> uint16_t1x2;
+typedef matrix<uint16_t, 1, 3> uint16_t1x3;
+typedef matrix<uint16_t, 1, 4> uint16_t1x4;
+typedef matrix<uint16_t, 2, 1> uint16_t2x1;
+typedef matrix<uint16_t, 2, 2> uint16_t2x2;
+typedef matrix<uint16_t, 2, 3> uint16_t2x3;
+typedef matrix<uint16_t, 2, 4> uint16_t2x4;
+typedef matrix<uint16_t, 3, 1> uint16_t3x1;
+typedef matrix<uint16_t, 3, 2> uint16_t3x2;
+typedef matrix<uint16_t, 3, 3> uint16_t3x3;
+typedef matrix<uint16_t, 3, 4> uint16_t3x4;
+typedef matrix<uint16_t, 4, 1> uint16_t4x1;
+typedef matrix<uint16_t, 4, 2> uint16_t4x2;
+typedef matrix<uint16_t, 4, 3> uint16_t4x3;
+typedef matrix<uint16_t, 4, 4> uint16_t4x4;
+#endif
+typedef matrix<int, 1, 1> int1x1;
+typedef matrix<int, 1, 2> int1x2;
+typedef matrix<int, 1, 3> int1x3;
+typedef matrix<int, 1, 4> int1x4;
+typedef matrix<int, 2, 1> int2x1;
+typedef matrix<int, 2, 2> int2x2;
+typedef matrix<int, 2, 3> int2x3;
+typedef matrix<int, 2, 4> int2x4;
+typedef matrix<int, 3, 1> int3x1;
+typedef matrix<int, 3, 2> int3x2;
+typedef matrix<int, 3, 3> int3x3;
+typedef matrix<int, 3, 4> int3x4;
+typedef matrix<int, 4, 1> int4x1;
+typedef matrix<int, 4, 2> int4x2;
+typedef matrix<int, 4, 3> int4x3;
+typedef matrix<int, 4, 4> int4x4;
+typedef matrix<uint, 1, 1> uint1x1;
+typedef matrix<uint, 1, 2> uint1x2;
+typedef matrix<uint, 1, 3> uint1x3;
+typedef matrix<uint, 1, 4> uint1x4;
+typedef matrix<uint, 2, 1> uint2x1;
+typedef matrix<uint, 2, 2> uint2x2;
+typedef matrix<uint, 2, 3> uint2x3;
+typedef matrix<uint, 2, 4> uint2x4;
+typedef matrix<uint, 3, 1> uint3x1;
+typedef matrix<uint, 3, 2> uint3x2;
+typedef matrix<uint, 3, 3> uint3x3;
+typedef matrix<uint, 3, 4> uint3x4;
+typedef matrix<uint, 4, 1> uint4x1;
+typedef matrix<uint, 4, 2> uint4x2;
+typedef matrix<uint, 4, 3> uint4x3;
+typedef matrix<uint, 4, 4> uint4x4;
+typedef matrix<int32_t, 1, 1> int32_t1x1;
+typedef matrix<int32_t, 1, 2> int32_t1x2;
+typedef matrix<int32_t, 1, 3> int32_t1x3;
+typedef matrix<int32_t, 1, 4> int32_t1x4;
+typedef matrix<int32_t, 2, 1> int32_t2x1;
+typedef matrix<int32_t, 2, 2> int32_t2x2;
+typedef matrix<int32_t, 2, 3> int32_t2x3;
+typedef matrix<int32_t, 2, 4> int32_t2x4;
+typedef matrix<int32_t, 3, 1> int32_t3x1;
+typedef matrix<int32_t, 3, 2> int32_t3x2;
+typedef matrix<int32_t, 3, 3> int32_t3x3;
+typedef matrix<int32_t, 3, 4> int32_t3x4;
+typedef matrix<int32_t, 4, 1> int32_t4x1;
+typedef matrix<int32_t, 4, 2> int32_t4x2;
+typedef matrix<int32_t, 4, 3> int32_t4x3;
+typedef matrix<int32_t, 4, 4> int32_t4x4;
+typedef matrix<uint32_t, 1, 1> uint32_t1x1;
+typedef matrix<uint32_t, 1, 2> uint32_t1x2;
+typedef matrix<uint32_t, 1, 3> uint32_t1x3;
+typedef matrix<uint32_t, 1, 4> uint32_t1x4;
+typedef matrix<uint32_t, 2, 1> uint32_t2x1;
+typedef matrix<uint32_t, 2, 2> uint32_t2x2;
+typedef matrix<uint32_t, 2, 3> uint32_t2x3;
+typedef matrix<uint32_t, 2, 4> uint32_t2x4;
+typedef matrix<uint32_t, 3, 1> uint32_t3x1;
+typedef matrix<uint32_t, 3, 2> uint32_t3x2;
+typedef matrix<uint32_t, 3, 3> uint32_t3x3;
+typedef matrix<uint32_t, 3, 4> uint32_t3x4;
+typedef matrix<uint32_t, 4, 1> uint32_t4x1;
+typedef matrix<uint32_t, 4, 2> uint32_t4x2;
+typedef matrix<uint32_t, 4, 3> uint32_t4x3;
+typedef matrix<uint32_t, 4, 4> uint32_t4x4;
+typedef matrix<int64_t, 1, 1> int64_t1x1;
+typedef matrix<int64_t, 1, 2> int64_t1x2;
+typedef matrix<int64_t, 1, 3> int64_t1x3;
+typedef matrix<int64_t, 1, 4> int64_t1x4;
+typedef matrix<int64_t, 2, 1> int64_t2x1;
+typedef matrix<int64_t, 2, 2> int64_t2x2;
+typedef matrix<int64_t, 2, 3> int64_t2x3;
+typedef matrix<int64_t, 2, 4> int64_t2x4;
+typedef matrix<int64_t, 3, 1> int64_t3x1;
+typedef matrix<int64_t, 3, 2> int64_t3x2;
+typedef matrix<int64_t, 3, 3> int64_t3x3;
+typedef matrix<int64_t, 3, 4> int64_t3x4;
+typedef matrix<int64_t, 4, 1> int64_t4x1;
+typedef matrix<int64_t, 4, 2> int64_t4x2;
+typedef matrix<int64_t, 4, 3> int64_t4x3;
+typedef matrix<int64_t, 4, 4> int64_t4x4;
+typedef matrix<uint64_t, 1, 1> uint64_t1x1;
+typedef matrix<uint64_t, 1, 2> uint64_t1x2;
+typedef matrix<uint64_t, 1, 3> uint64_t1x3;
+typedef matrix<uint64_t, 1, 4> uint64_t1x4;
+typedef matrix<uint64_t, 2, 1> uint64_t2x1;
+typedef matrix<uint64_t, 2, 2> uint64_t2x2;
+typedef matrix<uint64_t, 2, 3> uint64_t2x3;
+typedef matrix<uint64_t, 2, 4> uint64_t2x4;
+typedef matrix<uint64_t, 3, 1> uint64_t3x1;
+typedef matrix<uint64_t, 3, 2> uint64_t3x2;
+typedef matrix<uint64_t, 3, 3> uint64_t3x3;
+typedef matrix<uint64_t, 3, 4> uint64_t3x4;
+typedef matrix<uint64_t, 4, 1> uint64_t4x1;
+typedef matrix<uint64_t, 4, 2> uint64_t4x2;
+typedef matrix<uint64_t, 4, 3> uint64_t4x3;
+typedef matrix<uint64_t, 4, 4> uint64_t4x4;
+
+typedef matrix<half, 1, 1> half1x1;
+typedef matrix<half, 1, 2> half1x2;
+typedef matrix<half, 1, 3> half1x3;
+typedef matrix<half, 1, 4> half1x4;
+typedef matrix<half, 2, 1> half2x1;
+typedef matrix<half, 2, 2> half2x2;
+typedef matrix<half, 2, 3> half2x3;
+typedef matrix<half, 2, 4> half2x4;
+typedef matrix<half, 3, 1> half3x1;
+typedef matrix<half, 3, 2> half3x2;
+typedef matrix<half, 3, 3> half3x3;
+typedef matrix<half, 3, 4> half3x4;
+typedef matrix<half, 4, 1> half4x1;
+typedef matrix<half, 4, 2> half4x2;
+typedef matrix<half, 4, 3> half4x3;
+typedef matrix<half, 4, 4> half4x4;
+typedef matrix<float, 1, 1> float1x1;
+typedef matrix<float, 1, 2> float1x2;
+typedef matrix<float, 1, 3> float1x3;
+typedef matrix<float, 1, 4> float1x4;
+typedef matrix<float, 2, 1> float2x1;
+typedef matrix<float, 2, 2> float2x2;
+typedef matrix<float, 2, 3> float2x3;
+typedef matrix<float, 2, 4> float2x4;
+typedef matrix<float, 3, 1> float3x1;
+typedef matrix<float, 3, 2> float3x2;
+typedef matrix<float, 3, 3> float3x3;
+typedef matrix<float, 3, 4> float3x4;
+typedef matrix<float, 4, 1> float4x1;
+typedef matrix<float, 4, 2> float4x2;
+typedef matrix<float, 4, 3> float4x3;
+typedef matrix<float, 4, 4> float4x4;
+typedef matrix<double, 1, 1> double1x1;
+typedef matrix<double, 1, 2> double1x2;
+typedef matrix<double, 1, 3> double1x3;
+typedef matrix<double, 1, 4> double1x4;
+typedef matrix<double, 2, 1> double2x1;
+typedef matrix<double, 2, 2> double2x2;
+typedef matrix<double, 2, 3> double2x3;
+typedef matrix<double, 2, 4> double2x4;
+typedef matrix<double, 3, 1> double3x1;
+typedef matrix<double, 3, 2> double3x2;
+typedef matrix<double, 3, 3> double3x3;
+typedef matrix<double, 3, 4> double3x4;
+typedef matrix<double, 4, 1> double4x1;
+typedef matrix<double, 4, 2> double4x2;
+typedef matrix<double, 4, 3> double4x3;
+typedef matrix<double, 4, 4> double4x4;
+
+#ifdef __HLSL_ENABLE_16_BIT
+typedef matrix<float16_t, 1, 1> float16_t1x1;
+typedef matrix<float16_t, 1, 2> float16_t1x2;
+typedef matrix<float16_t, 1, 3> float16_t1x3;
+typedef matrix<float16_t, 1, 4> float16_t1x4;
+typedef matrix<float16_t, 2, 1> float16_t2x1;
+typedef matrix<float16_t, 2, 2> float16_t2x2;
+typedef matrix<float16_t, 2, 3> float16_t2x3;
+typedef matrix<float16_t, 2, 4> float16_t2x4;
+typedef matrix<float16_t, 3, 1> float16_t3x1;
+typedef matrix<float16_t, 3, 2> float16_t3x2;
+typedef matrix<float16_t, 3, 3> float16_t3x3;
+typedef matrix<float16_t, 3, 4> float16_t3x4;
+typedef matrix<float16_t, 4, 1> float16_t4x1;
+typedef matrix<float16_t, 4, 2> float16_t4x2;
+typedef matrix<float16_t, 4, 3> float16_t4x3;
+typedef matrix<float16_t, 4, 4> float16_t4x4;
+#endif
+
+typedef matrix<float32_t, 1, 1> float32_t1x1;
+typedef matrix<float32_t, 1, 2> float32_t1x2;
+typedef matrix<float32_t, 1, 3> float32_t1x3;
+typedef matrix<float32_t, 1, 4> float32_t1x4;
+typedef matrix<float32_t, 2, 1> float32_t2x1;
+typedef matrix<float32_t, 2, 2> float32_t2x2;
+typedef matrix<float32_t, 2, 3> float32_t2x3;
+typedef matrix<float32_t, 2, 4> float32_t2x4;
+typedef matrix<float32_t, 3, 1> float32_t3x1;
+typedef matrix<float32_t, 3, 2> float32_t3x2;
+typedef matrix<float32_t, 3, 3> float32_t3x3;
+typedef matrix<float32_t, 3, 4> float32_t3x4;
+typedef matrix<float32_t, 4, 1> float32_t4x1;
+typedef matrix<float32_t, 4, 2> float32_t4x2;
+typedef matrix<float32_t, 4, 3> float32_t4x3;
+typedef matrix<float32_t, 4, 4> float32_t4x4;
+typedef matrix<float64_t, 1, 1> float64_t1x1;
+typedef matrix<float64_t, 1, 2> float64_t1x2;
+typedef matrix<float64_t, 1, 3> float64_t1x3;
+typedef matrix<float64_t, 1, 4> float64_t1x4;
+typedef matrix<float64_t, 2, 1> float64_t2x1;
+typedef matrix<float64_t, 2, 2> float64_t2x2;
+typedef matrix<float64_t, 2, 3> float64_t2x3;
+typedef matrix<float64_t, 2, 4> float64_t2x4;
+typedef matrix<float64_t, 3, 1> float64_t3x1;
+typedef matrix<float64_t, 3, 2> float64_t3x2;
+typedef matrix<float64_t, 3, 3> float64_t3x3;
+typedef matrix<float64_t, 3, 4> float64_t3x4;
+typedef matrix<float64_t, 4, 1> float64_t4x1;
+typedef matrix<float64_t, 4, 2> float64_t4x2;
+typedef matrix<float64_t, 4, 3> float64_t4x3;
+typedef matrix<float64_t, 4, 4> float64_t4x4;
+
 } // namespace hlsl
 
 #endif //_HLSL_HLSL_BASIC_TYPES_H_
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 2913d16fca4823..d1a53d2ad88864 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -472,8 +472,81 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() {
   HLSLNamespace->addDecl(Template);
 }
 
+/// Declare the implicit hlsl::matrix<element, rows_count, cols_count> alias.
+void HLSLExternalSemaSource::defineHLSLMatrixAlias() {
+  ASTContext &AST = SemaPtr->getASTContext();
+  llvm::SmallVector<NamedDecl *> TemplateParams;
+  // Index 0: the element type parameter `element`, defaulting to float.
+  auto *TypeParam = TemplateTypeParmDecl::Create(
+      AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 0,
+      &AST.Idents.get("element", tok::TokenKind::identifier), false, false);
+  TypeParam->setDefaultArgument(
+      AST, SemaPtr->getTrivialTemplateArgumentLoc(
+               TemplateArgument(AST.FloatTy), QualType(), SourceLocation()));
+
+  TemplateParams.emplace_back(TypeParam);
+  // Index 1: the row count parameter `rows_count`, an int defaulting to 4.
+  // FIXME: These should be 64 bit to be consistent with other clang matrices.
+  auto *RowsParam = NonTypeTemplateParmDecl::Create(
+      AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 1,
+      &AST.Idents.get("rows_count", tok::TokenKind::identifier), AST.IntTy,
+      false, AST.getTrivialTypeSourceInfo(AST.IntTy));
+  llvm::APInt RVal(AST.getIntWidth(AST.IntTy), 4);
+  TemplateArgument RDefault(AST, llvm::APSInt(std::move(RVal)), AST.IntTy,
+                           /*IsDefaulted=*/true);
+  RowsParam->setDefaultArgument(
+      AST, SemaPtr->getTrivialTemplateArgumentLoc(RDefault, AST.IntTy,
+                                                  SourceLocation(), RowsParam));
+  TemplateParams.emplace_back(RowsParam);
+  // Index 2: the column count parameter `cols_count`, an int defaulting to 4.
+  auto *ColsParam = NonTypeTemplateParmDecl::Create(
+      AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 2,
+      &AST.Idents.get("cols_count", tok::TokenKind::identifier), AST.IntTy,
+      false, AST.getTrivialTypeSourceInfo(AST.IntTy));
+  llvm::APInt CVal(AST.getIntWidth(AST.IntTy), 4);
+  TemplateArgument CDefault(AST, llvm::APSInt(std::move(CVal)), AST.IntTy,
+                           /*IsDefaulted=*/true);
+  ColsParam->setDefaultArgument(
+      AST, SemaPtr->getTrivialTemplateArgumentLoc(CDefault, AST.IntTy,
+                                                  SourceLocation(), ColsParam));
+  TemplateParams.emplace_back(ColsParam);
+
+  auto *ParamList =
+      TemplateParameterList::Create(AST, SourceLocation(), SourceLocation(),
+                                    TemplateParams, SourceLocation(), nullptr);
+
+  IdentifierInfo &II = AST.Idents.get("matrix", tok::TokenKind::identifier);
+  // The aliased type is a dependent matrix sized by the two count parameters.
+  QualType AliasType = AST.getDependentSizedMatrixType(
+      AST.getTemplateTypeParmType(0, 0, false, TypeParam),
+      DeclRefExpr::Create(
+          AST, NestedNameSpecifierLoc(), SourceLocation(), RowsParam, false,
+          DeclarationNameInfo(RowsParam->getDeclName(), SourceLocation()),
+          AST.IntTy, VK_LValue),
+      DeclRefExpr::Create(
+          AST, NestedNameSpecifierLoc(), SourceLocation(), ColsParam, false,
+          DeclarationNameInfo(ColsParam->getDeclName(), SourceLocation()),
+          AST.IntTy, VK_LValue),
+      SourceLocation());
+
+  auto *Record = TypeAliasDecl::Create(AST, HLSLNamespace, SourceLocation(),
+                                       SourceLocation(), &II,
+                                       AST.getTrivialTypeSourceInfo(AliasType));
+  Record->setImplicit(true);
+
+  auto *Template =
+      TypeAliasTemplateDecl::Create(AST, HLSLNamespace, SourceLocation(),
+                                    Record->getIdentifier(), ParamList, Record);
+
+  Record->setDescribedAliasTemplate(Template);
+  Template->setImplicit(true);
+  Template->setLexicalDeclContext(Record->getDeclContext());
+  HLSLNamespace->addDecl(Template);
+}
+
 void HLSLExternalSemaSource::defineTrivialHLSLTypes() {
   defineHLSLVectorAlias();
+  defineHLSLMatrixAlias();
 }
 
 /// Set up common members and attributes for buffer types
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index c44fc9c4194ca4..9213b4d95a70d9 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -2447,7 +2447,7 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
 
 QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols,
                                SourceLocation AttrLoc) {
-  assert(Context.getLangOpts().MatrixTypes &&
+  assert((getLangOpts().MatrixTypes || getLangOpts().HLSL) &&
          "Should never build a matrix type when it is disabled");
 
   // Check element type, if it is not dependent.
diff --git a/clang/test/AST/HLSL/matrix-alias.hlsl b/clang/test/AST/HLSL/matrix-alias.hlsl
new file mode 100644
index 00000000000000..afac2cfed7604b
--- /dev/null
+++ b/clang/test/AST/HLSL/matrix-alias.hlsl
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// Test that matrix aliases are set up properly for HLSL
+
+// CHECK: NamespaceDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> implicit hlsl
+// CHECK-NEXT: TypeAliasTemplateDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> implicit vector
+// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element
+// CHECK-NEXT: TemplateArgument type 'float'
+// CHECK-NEXT: BuiltinType 0x{{[0-9a-fA-F]+}} 'float'
+// CHECK-NEXT: NonTypeTemplateParmDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> 'int' depth 0 index 1 element_count
+// CHECK-NEXT: TemplateArgument expr
+// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 'int' 4
+// CHECK-NEXT: TypeAliasDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> implicit vector 'vector<element, element_count>'
+// CHECK-NEXT: DependentSizedExtVectorType 0x{{[0-9a-fA-F]+}} 'vector<element, element_count>' dependent <invalid sloc>
+// CHECK-NEXT: TemplateTypeParmType 0x{{[0-9a-fA-F]+}} 'element' dependent depth 0 index 0
+// CHECK-NEXT: TemplateTypeParm 0x{{[0-9a-fA-F]+}} 'element'
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 'int' lvalue
+// CHECK-SAME: NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int'
+
+// Make sure we got a using directive at the end.
+// CHECK: UsingDirectiveDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> Namespace 0x{{[0-9a-fA-F]+}} 'hlsl'
+
+[numthreads(1,1,1)]
+int entry() {
+  // Verify that the alias is generated inside the hlsl namespace.
+  hlsl::matrix<float, 2, 2> Mat2x2;
+
+  // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:26:3, col:35>
+  // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:29> col:29 Mat2x2 'hlsl::matrix<float, 2, 2>'
+
+  // Verify that you don't need to specify the namespace.
+  matrix<int, 2, 2> Vec2x2a;
+
+  // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:32:3, col:28>
+  // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:21> col:21 Vec2x2a 'matrix<int, 2, 2>'
+
+  // Build a bigger matrix.
+  matrix<double, 4, 4> Mat4x4;
+
+  // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:38:3, col:30>
+  // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:24> col:24 Mat4x4 'matrix<double, 4, 4>'
+
+  // Verify that the implicit arguments generate the correct type.
+  matrix<> ImpMat4x4;
+
+  // CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:44:3, col:21>
+  // CHECK-NEXT: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:12> col:12 ImpMat4x4 'matrix<>':'matrix<float, 4, 4>'
+  return 1;
+}
diff --git a/clang/test/AST/HLSL/vector-alias.hlsl b/clang/test/AST/HLSL/vector-alias.hlsl
index 3d112ee1b22303..e7c72d51a6338c 100644
--- a/clang/test/AST/HLSL/vector-alias.hlsl
+++ b/clang/test/AST/HLSL/vector-alias.hlsl
@@ -13,7 +13,7 @@
 // CHECK-NEXT: TemplateTypeParmType 0x{{[0-9a-fA-F]+}} 'element' dependent depth 0 index 0
 // CHECK-NEXT: TemplateTypeParm 0x{{[0-9a-fA-F]+}} 'element'
 // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 'int' lvalue
-// NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int'
+// CHECK-SAME: NonTypeTemplateParm 0x{{[0-9a-fA-F]+}} 'element_count' 'int'
 
 // Make sure we got a using directive at the end.
 // CHECK: UsingDirectiveDecl 0x{{[0-9a-fA-F]+}} <<invalid sloc>> <invalid sloc> Namespace 0x{{[0-9a-fA-F]+}} 'hlsl'
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
index 8974d2b2600292..7ade233900bd2b 100644
--- a/clang/test/CodeGenCXX/matrix-type-operators.cpp
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -1,7 +1,5 @@
 // RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,NOOPT %s
 // RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s
-typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
-using fx2x3_t = float __attribute__((matrix_type(2, 3)));
 
 template <typename EltTy, unsigned Rows, unsigned Columns>
 struct MyMatrix {
@@ -140,7 +138,7 @@ void test_IntWrapper_Add(MyMatrix<double, 10, 9> &m) {
   // NOOPT:       [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8{{$}}
   // OPT:         [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
   // CHECK-NEXT:  [[SCALAR:%.*]] = call noundef i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3)
-  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 %call to double
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR_FP]], i64 0
   // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
   // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[MATRIX]], [[SCALAR_EMBED1]]
@@ -154,7 +152,7 @@ void test_IntWrapper_Add(MyMatrix<double, 10, 9> &m) {
 void test_IntWrapper_Sub(MyMatrix<double, 10, 9> &m) {
   // CHECK-LABEL: define{{.*}} void @_Z19test_IntWrapper_SubR8MyMatrixIdLj10ELj9EE(
   // CHECK:       [[SCALAR:%.*]] = call noundef i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3)
-  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 %call to double
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
   // NOOPT:       [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8{{$}}
   // OPT:         [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR_FP]], i64 0
@@ -197,7 +195,7 @@ MyMatrix<float, 2, 2> test_multiply_template(MyMatrix<float, 2, 5> Mat1,
 void test_IntWrapper_Multiply(MyMatrix<double, 10, 9> &m, IntWrapper &w3) {
   // CHECK-LABEL: define{{.*}} void @_Z24test_IntWrapper_MultiplyR8MyMatrixIdLj10ELj9EER10IntWrapper(
   // CHECK:       [[SCALAR:%.*]] = call noundef i32 @_ZN10IntWrappercviEv(ptr noundef {{.*}})
-  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 %call to double
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
   // NOOPT:       [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8{{$}}
   // OPT:         [[MATRIX:%.*]] = load <90 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR_FP]], i64 0
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl
new file mode 100644
index 00000000000000..3eb2042aab611a
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast-template.hlsl
@@ -0,0 +1,351 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - -DSPIRV | FileCheck %s --check-prefixes=CHECK,SPIRV
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s
+
+
+template <typename X>
+using matrix_3_3 = matrix<X, 3, 3>;
+
+template <typename Y>
+using matrix_4_4 = matrix<Y, 4, 4>;
+
+// CHECK-LABEL: define {{.*}}CastCharMatrixToIntCStyle
+void CastCharMatrixToIntCStyle() {
+  // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+
+  matrix_4_4<int16_t> c;
+  matrix_4_4<int> i;
+  i = (matrix_4_4<int>)c;
+}
+
+// CHECK-LABEL: define {{.*}}CastCharMatrixToIntStaticCast
+void CastCharMatrixToIntStaticCast() {
+  // CHECK: [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT: [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT: store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+
+  matrix_4_4<int16_t> c;
+  matrix_4_4<int> i;
+  i = static_cast<matrix_4_4<int>>(c);
+}
+
+// CHECK-LABEL: define {{.*}}CastCharMatrixToUnsignedIntCStyle
+void CastCharMatrixToUnsignedIntCStyle() {
+  // CHECK:       [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int16_t> c;
+  matrix_4_4<unsigned int> u;
+  u = (matrix_4_4<unsigned int>)c;
+}
+
+// CHECK-LABEL: define {{.*}}CastCharMatrixToUnsignedIntStaticCast
+void CastCharMatrixToUnsignedIntStaticCast() {
+  // CHECK:       [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int16_t> c;
+  matrix_4_4<unsigned int> u;
+  u = static_cast<matrix_4_4<unsigned int>>(c);
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedLongIntMatrixToShortCStyle
+void CastUnsignedLongIntMatrixToShortCStyle() {
+  // CHECK:      [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8
+  // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[U]] to <16 x i16>
+  // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT: ret void
+
+  matrix_4_4<unsigned long int> u;
+  matrix_4_4<short int> s;
+  s = (matrix_4_4<short int>)u;
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedLongIntMatrixToShortStaticCast
+void CastUnsignedLongIntMatrixToShortStaticCast() {
+  // CHECK:      [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8
+  // CHECK-NEXT: [[CONV:%.*]] = trunc <16 x i64> [[U]] to <16 x i16>
+  // CHECK-NEXT: store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT: ret void
+
+  matrix_4_4<unsigned long int> u;
+  matrix_4_4<short int> s;
+  s = static_cast<matrix_4_4<short int>>(u);
+}
+
+// CHECK-LABEL: define {{.*}}CastIntMatrixToShortCStyle
+void CastIntMatrixToShortCStyle() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<short int> s;
+  s = (matrix_4_4<short int>)i;
+}
+
+// CHECK-LABEL: define {{.*}}CastIntMatrixToShortStaticCast
+void CastIntMatrixToShortStaticCast() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<short int> s;
+  s = static_cast<matrix_4_4<short int>>(i);
+}
+
+// CHECK-LABEL: define {{.*}}CastIntMatrixToFloatCStyle
+void CastIntMatrixToFloatCStyle() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sitofp <16 x i32> [[I]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<float> f;
+  f = (matrix_4_4<float>)i;
+}
+
+// CHECK-LABEL: define {{.*}}CastIntMatrixToFloatStaticCast
+void CastIntMatrixToFloatStaticCast() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sitofp <16 x i32> [[I]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<float> f;
+  f = static_cast<matrix_4_4<float>>(i);
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedIntMatrixToFloatCStyle
+void CastUnsignedIntMatrixToFloatCStyle() {
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> u;
+  matrix_4_4<float> f;
+  f = (matrix_4_4<float>)u;
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedIntMatrixToFloatStaticCast
+void CastUnsignedIntMatrixToFloatStaticCast() {
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> u;
+  matrix_4_4<float> f;
+  f = static_cast<matrix_4_4<float>>(u);
+}
+
+// CHECK-LABEL: define {{.*}}CastDoubleMatrixToIntCStyle
+void CastDoubleMatrixToIntCStyle() {
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<double> d;
+  matrix_4_4<int> i;
+  i = (matrix_4_4<int>)d;
+}
+
+// CHECK-LABEL: define {{.*}}CastDoubleMatrixToIntStaticCast
+void CastDoubleMatrixToIntStaticCast() {
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<double> d;
+  matrix_4_4<int> i;
+  i = static_cast<matrix_4_4<int>>(d);
+}
+
+// CHECK-LABEL: define {{.*}}CastFloatMatrixToUnsignedShortIntCStyle
+void CastFloatMatrixToUnsignedShortIntCStyle() {
+  // CHECK:       [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<float> f;
+  matrix_4_4<unsigned short int> i;
+  i = (matrix_4_4<unsigned short int>)f;
+}
+
+// CHECK-LABEL: define {{.*}}CastFloatMatrixToUnsignedShortIntStaticCast
+void CastFloatMatrixToUnsignedShortIntStaticCast() {
+  // CHECK:       [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<float> f;
+  matrix_4_4<unsigned short int> i;
+  i = static_cast<matrix_4_4<unsigned short int>>(f);
+}
+
+// CHECK-LABEL: define {{.*}}CastDoubleMatrixToFloatCStyle
+void CastDoubleMatrixToFloatCStyle() {
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<double> d;
+  matrix_4_4<float> f;
+  f = (matrix_4_4<float>)d;
+}
+
+// CHECK-LABEL: define {{.*}}CastDoubleMatrixToFloatStaticCast
+void CastDoubleMatrixToFloatStaticCast() {
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<double> d;
+  matrix_4_4<float> f;
+  f = static_cast<matrix_4_4<float>>(d);
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToUnsignedIntCStyle
+void CastUnsignedShortIntToUnsignedIntCStyle() {
+  // CHECK:       [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> s;
+  matrix_4_4<unsigned int> i;
+  i = (matrix_4_4<unsigned int>)s;
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToUnsignedIntStaticCast
+void CastUnsignedShortIntToUnsignedIntStaticCast() {
+  // CHECK:       [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> s;
+  matrix_4_4<unsigned int> i;
+  i = static_cast<matrix_4_4<unsigned int>>(s);
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedLongIntToUnsignedShortIntCStyle
+void CastUnsignedLongIntToUnsignedShortIntCStyle() {
+  // CHECK:       [[L:%.*]] = load <16 x i64>, ptr %l, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned long int> l;
+  matrix_4_4<unsigned short int> s;
+  s = (matrix_4_4<unsigned short int>)l;
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedLongIntToUnsignedShortIntStaticCast
+void CastUnsignedLongIntToUnsignedShortIntStaticCast() {
+  // CHECK:       [[L:%.*]] = load <16 x i64>, ptr %l, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned long int> l;
+  matrix_4_4<unsigned short int> s;
+  s = static_cast<matrix_4_4<unsigned short int>>(l);
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToIntCStyle
+void CastUnsignedShortIntToIntCStyle() {
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr %u, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> u;
+  matrix_4_4<int> i;
+  i = (matrix_4_4<int>)u;
+}
+
+// CHECK-LABEL: define {{.*}}CastUnsignedShortIntToIntStaticCast
+void CastUnsignedShortIntToIntStaticCast() {
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr %u, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<unsigned short int> u;
+  matrix_4_4<int> i;
+  i = static_cast<matrix_4_4<int>>(u);
+}
+
+// CHECK-LABEL: define {{.*}}CastIntToUnsignedLongIntCStyle
+void CastIntToUnsignedLongIntCStyle() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr %i, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64>
+  // CHECK-NEXT:  store <16 x i64> [[CONV]], ptr {{.*}}, align 8
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<unsigned long int> u;
+  u = (matrix_4_4<unsigned long int>)i;
+}
+
+// CHECK-LABEL: define {{.*}}CastIntToUnsignedLongIntStaticCast
+void CastIntToUnsignedLongIntStaticCast() {
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr %i, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64>
+  // CHECK-NEXT:  store <16 x i64> [[CONV]], ptr {{.*}}, align 8
+  // CHECK-NEXT:  ret void
+
+  matrix_4_4<int> i;
+  matrix_4_4<unsigned long int> u;
+  u = static_cast<matrix_4_4<unsigned long int>>(i);
+}
+
+class Foo {
+  int x[10];
+
+  Foo(matrix_4_4<int> x);
+};
+
+#ifdef SPIRV
+// These require mangling. DXIL uses MicrosoftMangle which doesn't support mangling matrices yet.
+// SPIRV-LABEL: define {{.*}}class_constructor_matrix_ty
+Foo class_constructor_matrix_ty(matrix_4_4<int> m) {
+  // SPIRV:         [[M:%.*]]  = load <16 x i32>, ptr {{.*}}, align 4
+  // SPIRV-NEXT:    call{{.*}} void @_ZN3FooC1Eu11matrix_typeILj4ELj4EiE(ptr noundef nonnull align 4 dereferenceable(40) %agg.result, <16 x i32> noundef [[M]])
+  // SPIRV-NEXT:    ret void
+
+  return Foo(m);
+}
+
+struct Bar {
+  float x[10];
+  Bar(matrix_3_3<float> x);
+};
+
+// SPIRV-LABEL: define {{.*}}struct_constructor_matrix_ty
+Bar struct_constructor_matrix_ty(matrix_3_3<float> m) {
+  // SPIRV:         [[M:%.*]] = load <9 x float>, ptr {{.*}}, align 4
+  // SPIRV-NEXT:    call{{.*}} void @_ZN3BarC1Eu11matrix_typeILj3ELj3EfE(ptr noundef nonnull align 4 dereferenceable(40) %agg.result, <9 x float> noundef [[M]])
+  // SPIRV-NEXT:    ret void
+
+  return Bar(m);
+}
+#endif
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
new file mode 100644
index 00000000000000..dd06137e0a7e4c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
@@ -0,0 +1,162 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - -DSPIRV | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s
+
+// Test explicit matrix casts.
+// This is adapted to HLSL from CodeGen/matrix-cast.c.
+
+// CHECK-LABEL: define {{.*}}cast_int16_matrix_to_int
+void cast_int16_matrix_to_int() {
+  int16_t4x4 c;
+  int4x4 i;
+
+  // CHECK:       [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  i = (int4x4)c;
+}
+
+// CHECK-LABEL: define {{.*}}cast_int16_matrix_to_uint
+void cast_int16_matrix_to_uint() {
+  int16_t4x4 c;
+  uint4x4 u;
+  // CHECK:       [[C:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i16> [[C]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  u = (uint4x4)c;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint64_matrix_to_int16
+void cast_uint64_matrix_to_int16() {
+  uint64_t4x4 u;
+  int16_t4x4 s;
+  // CHECK:       [[U:%.*]] = load <16 x i64>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i64> [[U]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  s = (int16_t4x4)u;
+}
+
+// CHECK-LABEL: define {{.*}}cast_int_matrix_to_int16
+void cast_int_matrix_to_int16() {
+  int4x4 i;
+  int16_t4x4 s;
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i32> [[I]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  s = (int16_t4x4)i;
+}
+
+// CHECK-LABEL: define {{.*}}cast_int_matrix_to_float
+void cast_int_matrix_to_float() {
+  int4x4 i;
+  float4x4 f;
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sitofp <16 x i32> [[I]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  f = (float4x4)i;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint_matrix_to_float
+void cast_uint_matrix_to_float() {
+  uint16_t4x4 u;
+  float4x4 f;
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = uitofp <16 x i16> [[U]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  f = (float4x4)u;
+}
+
+// CHECK-LABEL: define {{.*}}cast_double_matrix_to_int
+void cast_double_matrix_to_int() {
+  double4x4 d;
+  int4x4 i;
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptosi <16 x double> [[D]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  i = (int4x4)d;
+}
+
+// CHECK-LABEL: define {{.*}}cast_float_matrix_to_uint16
+void cast_float_matrix_to_uint16() {
+  float4x4 f;
+  uint16_t4x4 i;
+  // CHECK:       [[F:%.*]] = load <16 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = fptoui <16 x float> [[F]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  i = (uint16_t4x4)f;
+}
+
+// CHECK-LABEL: define {{.*}}cast_double_matrix_to_float
+void cast_double_matrix_to_float() {
+  double4x4 d;
+  float4x4 f;
+  // CHECK:       [[D:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = fptrunc <16 x double> [[D]] to <16 x float>
+  // CHECK-NEXT:  store <16 x float> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  f = (float4x4)d;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint16_to_uint
+void cast_uint16_to_uint() {
+  uint16_t4x4 s;
+  uint4x4 i;
+  // CHECK:       [[S:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[S]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  i = (uint4x4)s;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint64_to_uint16
+void cast_uint64_to_uint16() {
+  uint64_t4x4 l;
+  uint16_t4x4 s;
+  // CHECK:       [[L:%.*]] = load <16 x i64>, ptr {{.*}}, align 8
+  // CHECK-NEXT:  [[CONV:%.*]] = trunc <16 x i64> [[L]] to <16 x i16>
+  // CHECK-NEXT:  store <16 x i16> [[CONV]], ptr {{.*}}, align 2
+  // CHECK-NEXT:  ret void
+
+  s = (uint16_t4x4)l;
+}
+
+// CHECK-LABEL: define {{.*}}cast_uint16_to_int
+void cast_uint16_to_int() {
+  uint16_t4x4 u;
+  int4x4 i;
+  // CHECK:       [[U:%.*]] = load <16 x i16>, ptr {{.*}}, align 2
+  // CHECK-NEXT:  [[CONV:%.*]] = zext <16 x i16> [[U]] to <16 x i32>
+  // CHECK-NEXT:  store <16 x i32> [[CONV]], ptr {{.*}}, align 4
+  // CHECK-NEXT:  ret void
+
+  i = (int4x4)u;
+}
+
+// CHECK-LABEL: define {{.*}}cast_int_to_uint64
+void cast_int_to_uint64() {
+  int4x4 i;
+  uint64_t4x4 u;
+  // CHECK:       [[I:%.*]] = load <16 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:  [[CONV:%.*]] = sext <16 x i32> [[I]] to <16 x i64>
+  // CHECK-NEXT:  store <16 x i64> [[CONV]], ptr {{.*}}, align 8
+  // CHECK-NEXT:  ret void
+
+  u = (uint64_t4x4)i;
+}
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl
new file mode 100644
index 00000000000000..9d59d821b8ebdf
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose-template.hlsl
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,SPIRV
+
+// Test the matrix type transpose builtin.
+// Since all these cases require mangling, DXIL is not tested for now.
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+using matrix_t = matrix<EltTy, Rows, Columns>;
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  matrix_t<EltTy, Rows, Columns> value;
+};
+
+// Can't test utility function with matrix param without mangling.
+template <typename T, unsigned R, unsigned C>
+MyMatrix<T, C, R> transpose(const MyMatrix<T, R, C> M) {
+  MyMatrix<T, C, R> Res;
+  Res.value = __builtin_matrix_transpose(M.value);
+  return Res;
+}
+
+void test_transpose_template1() {
+  // SPIRV-LABEL: define{{.*}} void @_Z24test_transpose_template1v()
+  // SPIRV:         call{{.*}} void @_Z9transposeIiLj3ELj4EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.0) align 4 %M1_t, ptr byval(%struct.MyMatrix) align 4 %agg.tmp)
+  // SPIRV-LABEL: define{{.*}} void @_Z9transposeIiLj3ELj4EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(
+  // SPIRV:         [[M:%.*]] = load <12 x i32>, ptr {{.*}}, align 4
+  // SPIRV-NEXT:    [[M_T:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[M]], i32 3, i32 4)
+
+  MyMatrix<int, 3, 4> M1;
+  MyMatrix<int, 4, 3> M1_t = transpose(M1);
+}
+
+void test_transpose_template2(inout MyMatrix<double, 3, 2> M) {
+  // SPIRV-LABEL: define{{.*}} void @_Z24test_transpose_template2R8MyMatrixIdLj3ELj2EE(
+  // SPIRV:         call{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.1) align 8 %agg.tmp1, ptr byval(%struct.MyMatrix.2) align 8 %agg.tmp2)
+  // SPIRV-NEXT:    call{{.*}} void @_Z9transposeIdLj2ELj3EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.2) align 8 %agg.tmp, ptr byval(%struct.MyMatrix.1) align 8 %agg.tmp1)
+  // SPIRV-NEXT:    call{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(ptr dead_on_unwind writable sret(%struct.MyMatrix.1) align 8 %M2_t, ptr byval(%struct.MyMatrix.2) align 8 %agg.tmp)
+
+  // SPIRV-LABEL: define{{.*}} void @_Z9transposeIdLj3ELj2EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(
+  // SPIRV:         [[M:%.*]] = load <6 x double>, ptr {{.*}}, align 8
+  // SPIRV-NEXT:    [[M_T:%.*]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[M]], i32 3, i32 2)
+  // SPIRV-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds nuw %struct.MyMatrix.1, ptr %agg.result, i32 0, i32 0
+  // SPIRV-NEXT:    store <6 x double> [[M_T]], ptr [[RES_ADDR]], align 8
+
+  // SPIRV-LABEL: define{{.*}} void @_Z9transposeIdLj2ELj3EE8MyMatrixIT_XT1_EXT0_EES0_IS1_XT0_EXT1_EE(
+  // SPIRV:         [[M:%.*]] = load <6 x double>, ptr {{.*}}, align 8
+  // SPIRV-NEXT:    [[M_T:%.*]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[M]], i32 2, i32 3)
+  // SPIRV-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds nuw %struct.MyMatrix.2, ptr %agg.result, i32 0, i32 0
+  // SPIRV-NEXT:    store <6 x double> [[M_T]], ptr [[RES_ADDR]], align 8
+
+  MyMatrix<double, 2, 3> M2_t = transpose(transpose(transpose(M)));
+}
+
+matrix_t<float, 3, 3> get_matrix();
+
+void test_transpose_rvalue() {
+  // CHECK-LABEL: define{{.*}} void @_Z21test_transpose_rvaluev()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [9 x float], align 4
+  // CHECK-NEXT:    [[CALL_RES:%.*]] = call{{.*}} <9 x float> @_Z10get_matrixv()
+  // CHECK-NEXT:    [[ADD:%.*]] = fadd <9 x float> [[CALL_RES]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+  // CHECK-NEXT:    [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[ADD]], i32 3, i32 3)
+  // CHECK-NEXT:    store <9 x float> [[M_T]], ptr [[M_T_ADDR]], align 4
+  matrix_t<float, 3, 3> m_t = __builtin_matrix_transpose(get_matrix() + 2.0);
+}
+
+void test_transpose_const(const matrix_t<float, 3, 3> m) {
+  // CHECK-LABEL:  define{{.*}} void @_Z20test_transpose_constu11matrix_typeILj3ELj3EfE(
+  // CHECK:         [[MATRIX:%.*]] = load <9 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[MATRIX]], i32 3, i32 3)
+  // CHECK-NEXT:    store <9 x float> [[M_T]], ptr %m_t, align 4
+  matrix_t<float, 3, 3> m_t = __builtin_matrix_transpose(m);
+}
+
+// TODO: Enable once initialization support is defined and implemented for
+//       matrix types.
+// void test_lvalue_conversion() {
+//  constexpr double4x4 m = {};
+//  [] { return __builtin_matrix_transpose(m); }
+//}
+
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
new file mode 100644
index 00000000000000..426c415f9dce99
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
@@ -0,0 +1,95 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - -DSPIRV | FileCheck %s --check-prefixes=CHECK,SPIRV
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple dxil-pc-shadermodel6.3-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests the matrix type transpose builtin.
+
+// CHECK-LABEL: define {{.*}}transpose_double_4x4
+void transpose_double_4x4() {
+double4x4 a;
+  // CHECK:       [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[A]], i32 4, i32 4)
+  // CHECK-NEXT:  store <16 x double> [[TRANS]], ptr %a_t, align 8
+
+  double4x4 a_t = __builtin_matrix_transpose(a);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_float_3x2
+void transpose_float_3x2() {
+float3x2 a;
+  // CHECK:        [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
+  // CHECK-NEXT:   store <6 x float> [[TRANS]], ptr %a_t, align 4
+
+  float2x3 a_t = __builtin_matrix_transpose(a);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_int_4x3
+void transpose_int_4x3() {
+int4x3 a;
+  // CHECK:         [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
+  // CHECK-NEXT:    store <12 x i32> [[TRANS]], ptr %a_t, align 4
+
+  int3x4 a_t = __builtin_matrix_transpose(a);
+}
+
+struct Foo {
+  uint1x4 In;
+  uint4x1 Out;
+};
+
+// CHECK-LABEL: define {{.*}}transpose_struct_member
+void transpose_struct_member() {
+struct Foo F;
+  // CHECK:          [[F:%.*]] = alloca %struct.Foo, align 4
+  // CHECK:         [[M:%.*]] = load <4 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M]], i32 1, i32 4)
+  // CHECK-NEXT:    [[OUT_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr [[F]], i32 0, i32 1
+  // CHECK-NEXT:    store <4 x i32> [[M_T]], ptr [[OUT_PTR]], align 4
+
+  F.Out = __builtin_matrix_transpose(F.In);
+}
+
+// CHECK-LABEL: define {{.*}}transpose_transpose_struct_member
+void transpose_transpose_struct_member() {
+struct Foo F;
+  // CHECK:          [[F:%.*]] = alloca %struct.Foo, align 4
+  // CHECK:         [[M:%.*]] = load <4 x i32>, ptr {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M]], i32 1, i32 4)
+  // CHECK-NEXT:    [[M_T2:%.*]] = call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> [[M_T]], i32 4, i32 1)
+  // CHECK-NEXT:    [[IN_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr [[F]], i32 0, i32 0
+  // CHECK-NEXT:    store <4 x i32> [[M_T2]], ptr [[IN_PTR]], align 4
+
+  F.In = __builtin_matrix_transpose(__builtin_matrix_transpose(F.In));
+}
+
+#ifdef SPIRV
+double4x4 get_matrix(void);
+
+// SPIRV-LABEL: define {{.*}}transpose_rvalue
+void transpose_rvalue(void) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    [[M_T_ADDR:%.*]] = alloca [16 x double], align 8
+  // SPIRV-NEXT:    [[CALL:%.*]] = call{{.*}} <16 x double> @_Z10get_matrixv()
+  // SPIRV-NEXT:    [[M_T:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[CALL]], i32 4, i32 4)
+  // SPIRV-NEXT:   store <16 x double> [[M_T]], ptr [[M_T_ADDR]], align 8
+
+  double4x4 m_t = __builtin_matrix_transpose(get_matrix());
+}
+
+double4x4 global_matrix;
+
+// SPIRV-LABEL: define {{.*}}transpose_global
+void transpose_global(void) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    [[M_T_ADDR:%.*]] = alloca [16 x double], align 8
+  // SPIRV-NEXT:    [[GLOBAL_MATRIX:%.*]] = load <16 x double>, ptr @global_matrix, align 8
+  // SPIRV-NEXT:    [[M_T:%.*]] = call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> [[GLOBAL_MATRIX]], i32 4, i32 4)
+  // SPIRV-NEXT:    store <16 x double> [[M_T]], ptr [[M_T_ADDR]], align 8
+
+  double4x4 m_t = __builtin_matrix_transpose(global_matrix);
+}
+
+#endif
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl
new file mode 100644
index 00000000000000..9b36dfabce1385
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators-template.hlsl
@@ -0,0 +1,447 @@
+// RUN: %clang_cc1 -O0 -triple spirv-unknown-vulkan-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,NOOPT
+// RUN: %clang_cc1 -O1 -triple spirv-unknown-vulkan-compute -std=hlsl202y -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,OPT
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = matrix<EltTy, Rows, Columns>;
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t add(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy0, R0, C0> B) {
+  return A.value + B.value;
+}
+
+// CHECK-LABEL: define {{.*}}test_add_template
+void test_add_template() {
+  // CHECK:       call{{.*}} <8 x float> @_Z3addIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}}, ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}})
+
+  // CHECK-LABEL: define{{.*}} <8 x float> @_Z3addIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(
+  // NOOPT:       [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT:       [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:         [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <8 x float> [[MAT1]], [[MAT2]]
+  // CHECK-NEXT:  ret <8 x float> [[RES]]
+
+  MyMatrix<float, 2, 4> Mat1;
+  MyMatrix<float, 2, 4> Mat2;
+  Mat1.value = add(Mat1, Mat2);
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t subtract(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy0, R0, C0> B) {
+  return A.value - B.value;
+}
+
+// CHECK-LABEL: define {{.*}}test_subtract_template
+void test_subtract_template() {
+  // CHECK:       call{{.*}} <8 x float> @_Z8subtractIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}}, ptr noalias noundef nonnull align 4 dereferenceable(32) %{{.*}})
+
+  // CHECK-LABEL: define{{.*}} <8 x float> @_Z8subtractIfLj2ELj4EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tES2_S2_(
+  // NOOPT:       [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT:       [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:         [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <8 x float> [[MAT1]], [[MAT2]]
+  // CHECK-NEXT:  ret <8 x float> [[RES]]
+
+  MyMatrix<float, 2, 4> Mat1;
+  MyMatrix<float, 2, 4> Mat2;
+  Mat1.value = subtract(Mat1, Mat2);
+}
+
+struct DoubleWrapper1 {
+  int x;
+  operator double() {
+    return x;
+  }
+};
+
+// CHECK-LABEL: define {{.*}}test_DoubleWrapper1_Sub1
+void test_DoubleWrapper1_Sub1(inout MyMatrix<double, 4, 3> m) {
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper1cvdEv(ptr {{[^,]*}} %w1)
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <12 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  DoubleWrapper1 w1;
+  w1.x = 10;
+  m.value = m.value - w1;
+}
+
+// CHECK-LABEL: define {{.*}}test_DoubleWrapper1_Sub2
+void test_DoubleWrapper1_Sub2(inout MyMatrix<double, 4, 3> m) {
+  // CHECK:       [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper1cvdEv(ptr {{[^,]*}} %w1)
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <12 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  DoubleWrapper1 w1;
+  w1.x = 10;
+  m.value = w1 - m.value;
+}
+
+struct DoubleWrapper2 {
+  int x;
+  operator double() {
+    return x;
+  }
+};
+
+// CHECK-LABEL: define {{.*}}test_DoubleWrapper2_Add1
+void test_DoubleWrapper2_Add1(inout MyMatrix<double, 4, 3> m) {
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.+}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.+}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK:       [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper2cvdEv(ptr {{[^,]*}} %w2)
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <12 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  DoubleWrapper2 w2;
+  w2.x = 20;
+  m.value = m.value + w2;
+}
+
+// CHECK-LABEL: define {{.*}}test_DoubleWrapper2_Add2
+void test_DoubleWrapper2_Add2(inout MyMatrix<double, 4, 3> m) {
+  // CHECK:       [[SCALAR:%.*]] = call{{.*}} double @_ZN14DoubleWrapper2cvdEv(ptr {{[^,]*}} %w2)
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <12 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  DoubleWrapper2 w2;
+  w2.x = 20;
+  m.value = w2 + m.value;
+}
+
+struct IntWrapper {
+  uint16_t x;
+  operator int() {
+    return x;
+  }
+};
+
+// CHECK-LABEL: define {{.*}}test_IntWrapper_Add
+void test_IntWrapper_Add(inout MyMatrix<double, 4, 3> m) {
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3)
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <12 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  IntWrapper w3;
+  w3.x = 13;
+  m.value = m.value + w3;
+}
+
+// CHECK-LABEL: define {{.*}}test_IntWrapper_Sub
+void test_IntWrapper_Sub(inout MyMatrix<double, 4, 3> m) {
+  // CHECK:       [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %w3)
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <12 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+
+  IntWrapper w3;
+  w3.x = 13;
+  m.value = w3 - m.value;
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0, unsigned C1>
+typename MyMatrix<EltTy0, R0, C1>::matrix_t multiply(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy0, C0, C1> B) {
+  return A.value * B.value;
+}
+
+// CHECK-LABEL: define {{.*}}test_multiply_template
+MyMatrix<float, 2, 2> test_multiply_template(MyMatrix<float, 2, 4> Mat1,
+                                             MyMatrix<float, 4, 2> Mat2) {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %tmp = alloca %struct.MyMatrix, align 4
+  // CHECK-NEXT:    %tmp1 = alloca %struct.MyMatrix.2, align 4
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %Mat1, i64 32, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 32, ptr %tmp)
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp1, ptr align 4 %Mat2, i64 32, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 32, ptr %tmp1)
+  // CHECK-NEXT:    [[RES:%.*]] = call{{.*}} <4 x float> @_Z8multiplyIfLj2ELj4ELj2EEN8MyMatrixIT_XT0_EXT2_EE8matrix_tES0_IS1_XT0_EXT1_EES0_IS1_XT1_EXT2_EE(ptr noalias noundef nonnull align 4 dereferenceable(32) %tmp, ptr noalias noundef nonnull align 4 dereferenceable(32) %tmp1)
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %Mat1, ptr align 4 %tmp, i64 32, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 32, ptr %tmp)
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %Mat2, ptr align 4 %tmp1, i64 32, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 32, ptr %tmp1)
+  // CHECK-NEXT:    %value = getelementptr inbounds nuw %struct.MyMatrix.1, ptr %agg.result, i32 0, i32 0
+  // CHECK-NEXT:    store <4 x float> [[RES]], ptr %value, align 4
+  // CHECK-NEXT:    ret void
+  //
+  // CHECK-LABEL:  define{{.*}} <4 x float> @_Z8multiplyIfLj2ELj4ELj2EEN8MyMatrixIT_XT0_EXT2_EE8matrix_tES0_IS1_XT0_EXT1_EES0_IS1_XT1_EXT2_EE(
+  // NOOPT:         [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT:         [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[MAT1:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:           [[MAT2:%.*]] = load <8 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.matrix.multiply.v4f32.v8f32.v8f32(<8 x float> [[MAT1]], <8 x float> [[MAT2]], i32 2, i32 4, i32 2)
+  // CHECK-NEXT:    ret <4 x float> [[RES]]
+
+  MyMatrix<float, 2, 2> Res;
+  Res.value = multiply(Mat1, Mat2);
+  return Res;
+}
+
+// CHECK-LABEL: define {{.*}}test_IntWrapper_Multiply
+void test_IntWrapper_Multiply(inout MyMatrix<double, 4, 3> m, inout IntWrapper w3) {
+  // CHECK:       [[SCALAR:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr noundef {{.*}})
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 [[SCALAR]] to double
+  // NOOPT:       [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <12 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x double> poison, double [[SCALAR_FP]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x double> [[SCALAR_EMBED]], <12 x double> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fmul <12 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <12 x double> [[RES]], ptr {{.*}}, align 8
+  // CHECK-NEXT:  ret void
+  m.value = w3 * m.value;
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+void insert(inout MyMatrix<EltTy, Rows, Columns> Mat, EltTy e, unsigned i, unsigned j) {
+  Mat.value[i][j] = e;
+}
+
+// CHECK-LABEL: define {{.*}}test_insert_template1
+void test_insert_template1(inout MyMatrix<unsigned, 2, 2> Mat, unsigned e, unsigned i, unsigned j) {
+  // NOOPT:         [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align 8{{$}}
+  // NOOPT:         [[E:%.*]] = load i32, ptr %e.addr, align 4{{$}}
+  // NOOPT-NEXT:    [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}}
+  // NOOPT-NEXT:    [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}}
+  // OPT:           [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:           [[E:%.*]] = load i32, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    call{{.*}} void @_Z6insertIjLj2ELj2EEv8MyMatrixIT_XT0_EXT1_EES1_jj(ptr noalias noundef nonnull align 4 dereferenceable(16) %{{.*}}, i32 noundef [[E]], i32 noundef [[I]], i32 noundef [[J]])
+  // CHECK:         ret void
+  //
+  // CHECK-LABEL: define{{.*}} void @_Z6insertIjLj2ELj2EEv8MyMatrixIT_XT0_EXT1_EES1_jj(
+  // NOOPT:         [[E:%.*]] = load i32, ptr %e.addr, align 4{{$}}
+  // NOOPT:         [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}}
+  // OPT:           [[E:%.*]] = load i32, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:           [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[I_EXT:%.*]] = zext i32 [[I]] to i64
+  // NOOPT-NEXT:    [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <4 x i32> [[MATINS]], ptr {{.*}}, align 4
+  // CHECK-NEXT:    ret void
+
+  insert(Mat, e, i, j);
+}
+
+// CHECK-LABEL: define {{.*}}test_insert_template2
+void test_insert_template2(inout MyMatrix<float, 3, 4> Mat, float e) {
+  // NOOPT:         [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align 8{{$}}
+  // NOOPT:         [[E:%.*]] = load float, ptr %e.addr, align 4{{$}}
+  // OPT:           [[MAT_ADDR:%.*]] = load ptr, ptr %Mat.addr, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    call{{.*}} void @_Z6insertIfLj3ELj4EEv8MyMatrixIT_XT0_EXT1_EES1_jj(ptr noalias noundef nonnull align 4 dereferenceable(48) %{{.*}}, float noundef [[E]], i32 noundef 2, i32 noundef 3)
+  // CHECK:         ret void
+  //
+  // CHECK-LABEL: define{{.*}} void @_Z6insertIfLj3ELj4EEv8MyMatrixIT_XT0_EXT1_EES1_jj(
+  // NOOPT:         [[E:%.*]] = load float, ptr %e.addr, align 4{{$}}
+  // NOOPT:         [[I:%.*]] = load i32, ptr %i.addr, align 4{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT:           [[I:%.*]] = load i32, ptr %i.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[I_EXT:%.*]] = zext i32 [[I]] to i64
+  // NOOPT-NEXT:    [[J:%.*]] = load i32, ptr %j.addr, align 4{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i32, ptr %j.addr, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 3
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 12
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <12 x float>, ptr {{.*}}, align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <12 x float> [[MAT]], float [[E]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <12 x float> [[MATINS]], ptr {{.*}}, align 4
+  // CHECK-NEXT:    ret void
+
+  insert(Mat, e, 2, 3);
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+EltTy extract(inout MyMatrix<EltTy, Rows, Columns> Mat) {
+  return Mat.value[1u][0u];
+}
+
+// CHECK-LABEL: define {{.*}}test_extract_template
+int test_extract_template(MyMatrix<int, 2, 2> Mat1) {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %tmp = alloca %struct.MyMatrix.5, align 4
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %Mat1, i64 16, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 16, ptr %tmp)
+  // CHECK-NEXT:    [[CALL:%.*]] = call{{.*}} i32 @_Z7extractIiLj2ELj2EET_8MyMatrixIS0_XT0_EXT1_EE(ptr noalias noundef nonnull align 4 dereferenceable(16) %tmp)
+  // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %Mat1, ptr align 4 %tmp, i64 16, i1 false)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 16, ptr %tmp)
+  // CHECK-NEXT:    ret i32 [[CALL]]
+  //
+  // CHECK-LABEL: define{{.*}} i32 @_Z7extractIiLj2ELj2EET_8MyMatrixIS0_XT0_EXT1_EE(
+  // NOOPT:         [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[MAT:%.*]] = load <4 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <4 x i32> [[MAT]], i64 1
+  // CHECK-NEXT:    ret i32 [[MATEXT]]
+
+  return extract(Mat1);
+}
+
+template <class R, class C>
+auto matrix_subscript(double4x4 m, R r, C c) -> decltype(m[r][c]) {}
+
+// CHECK-LABEL: define {{.*}}test_matrix_subscript
+double test_matrix_subscript(double4x4 m) {
+  // NOOPT:         [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[CALL:%.*]] = call{{.*}} nonnull align 8 dereferenceable(8) ptr @_Z16matrix_subscriptIiiEDTixixfp_fp0_fp1_Eu11matrix_typeILj4ELj4EdET_T0_(<16 x double> noundef [[MAT]], i32 noundef 1, i32 noundef 2)
+  // NOOPT-NEXT:    [[RES:%.*]] = load double, ptr [[CALL]], align 8{{$}}
+  // OPT-NEXT:      [[RES:%.*]] = load double, ptr [[CALL]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    ret double [[RES]]
+
+  return matrix_subscript(m, 1, 2);
+}
+
+// CHECK-LABEL: define {{.*}}test_matrix_subscript_const
+const double test_matrix_subscript_const(const double4x4 m) {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [16 x double], align 8
+  // CHECK-NEXT:    store <16 x double> [[M:%.*]], ptr [[M_ADDR]], align 8
+  // NOOPT:         [[NAMELESS1:%.*]] = load <16 x double>, ptr [[M_ADDR]], align 8{{$}}
+  // OPT:           [[NAMELESS1:%.*]] = load <16 x double>, ptr [[M_ADDR]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4
+  // CHECK-NEXT:    ret double [[MATEXT]]
+
+  return m[0][1];
+}
+
+struct UnsignedWrapper {
+  char x;
+  operator unsigned() {
+    return x;
+  }
+};
+
+// CHECK-LABEL: define {{.*}}extract_IntWrapper_idx
+double extract_IntWrapper_idx(inout double4x4 m, IntWrapper i, UnsignedWrapper j) {
+  // CHECK:         [[I:%.*]] = call{{.*}} i32 @_ZN10IntWrappercviEv(ptr {{[^,]*}} %i)
+  // CHECK-NEXT:    [[I_ADD:%.*]] = add nsw i32 [[I]], 1
+  // CHECK-NEXT:    [[I_ADD_EXT:%.*]] = sext i32 [[I_ADD]] to i64
+  // CHECK-NEXT:    [[J:%.*]] = call{{.*}} i32 @_ZN15UnsignedWrappercvjEv(ptr {{[^,]*}} %j)
+  // CHECK-NEXT:    [[J_SUB:%.*]] = sub i32 [[J]], 1
+  // CHECK-NEXT:    [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
+  // NOOPT-NEXT:    [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8{{$}}
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <16 x double>, ptr [[MAT_ADDR]], align 8{{$}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // OPT-NEXT:      [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <16 x double>, ptr [[MAT_ADDR]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
+  // CHECK-NEXT:    ret double [[MATEXT]]
+  return m[i + 1][j - 1];
+}
+
+template <class T, unsigned R, unsigned C>
+using matrix_type = matrix<T, R, C>;
+struct identmatrix_t {
+  template <class T, unsigned N>
+  operator matrix_type<T, N, N>() const {
+    matrix_type<T, N, N> result;
+    for (unsigned i = 0; i != N; ++i)
+      result[i][i] = 1;
+    return result;
+  }
+};
+
+constexpr identmatrix_t identmatrix;
+
+// CHECK-LABEL: define {{.*}}test_constexpr1
+void test_constexpr1(inout matrix_type<float, 4, 4> m) {
+  // NOOPT:         [[MAT:%.*]] = load <16 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[MAT:%.*]] = load <16 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[IM:%.*]] = call{{.*}} <16 x float> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIfLj4EEEv(ptr {{[^,]*}} @_ZL11identmatrix)
+  // CHECK-NEXT:    [[ADD:%.*]] = fadd <16 x float> [[MAT]], [[IM]]
+  // NOOPT-NEXT:    [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8{{$}}
+  // OPT-NEXT:      [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    store <16 x float> [[ADD]], ptr [[MAT_ADDR]], align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define{{.*}} <16 x float> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIfLj4EEEv(
+  // CHECK-LABEL: for.body:                                         ; preds = %for.cond
+  // NOOPT-NEXT:   [[I:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT-NEXT:     [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[I_EXT:%.*]] = zext i32 [[I]] to i64
+  // NOOPT-NEXT:   [[I2:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT-NEXT:     [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[I2_EXT:%.*]] = zext i32 [[I2]] to i64
+  // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
+  // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:   [[MAT:%.*]] = load <16 x float>, ptr %result, align 4{{$}}
+  // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]]
+  // CHECK-NEXT:   store <16 x float> [[MATINS]], ptr %result, align 4
+  // CHECK-NEXT:   br label %for.inc
+  m = m + identmatrix;
+}
+
+// CHECK-LABEL: define {{.*}}test_constexpr2
+void test_constexpr2(inout matrix_type<int, 4, 4> m) {
+  // CHECK:         [[IM:%.*]] = call{{.*}} <16 x i32> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIiLj4EEEv(ptr {{[^,]*}} @_ZL11identmatrix)
+  // NOOPT:         [[MAT:%.*]] = load <16 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[MAT:%.*]] = load <16 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[SUB:%.*]] = sub <16 x i32> [[IM]], [[MAT]]
+  // CHECK-NEXT:    [[SUB2:%.*]] = add <16 x i32> [[SUB]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  // NOOPT-NEXT:    [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8{{$}}
+  // OPT-NEXT:      [[MAT_ADDR:%.*]] = load ptr, ptr %m.addr, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    store <16 x i32> [[SUB2]], ptr [[MAT_ADDR]], align 4
+  // CHECK-NEXT:    ret void
+  //
+
+  // CHECK-LABEL: define{{.*}} <16 x i32> @_ZNK13identmatrix_tcvu11matrix_typeIXT0_EXT0_ET_EIiLj4EEEv(
+  // CHECK-LABEL: for.body:                                         ; preds = %for.cond
+  // NOOPT-NEXT:   [[I:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT-NEXT:     [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[I_EXT:%.*]] = zext i32 [[I]] to i64
+  // NOOPT-NEXT:   [[I2:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT-NEXT:     [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[I2_EXT:%.*]] = zext i32 [[I2]] to i64
+  // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
+  // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:   [[MAT:%.*]] = load <16 x i32>, ptr %result, align 4{{$}}
+  // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <16 x i32> [[MAT]], i32 1, i64 [[IDX2]]
+  // CHECK-NEXT:   store <16 x i32> [[MATINS]], ptr %result, align 4
+  // CHECK-NEXT:   br label %for.inc
+
+  m = identmatrix - m + 1;
+}
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
new file mode 100644
index 00000000000000..6bf5d4b67f54fd
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
@@ -0,0 +1,1515 @@
+// RUN: %clang_cc1 -O0 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,NOOPT
+// RUN: %clang_cc1 -O1 -triple spirv-unknown-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,OPT
+// RUIN: %clang_cc1 -O0 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,NOOPT
+// RUIN: %clang_cc1 -O1 -triple dxil-pc-shadermodel6.3-compute -finclude-default-header -fnative-half-type -emit-llvm -disable-llvm-passes  %s -o - | FileCheck %s --check-prefixes=CHECK,OPT
+
+// Test arithmetic operations on matrix types.
+// This is adapted to HLSL from CodeGen/matrix-type-operators.c.
+
+// Floating point matrix/scalar additions.
+
+// CHECK-LABEL: define {{.*}}add_matrix_matrix_double
+void add_matrix_matrix_double() { // double4x4 + double4x4; expects two <16 x double> loads, one fadd, one store
+double4x4 a;
+double4x4 b;
+double4x4 c;
+  // NOOPT:       [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[B]], [[C]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a = b + c; // same-typed operands: no conversion is expected between the loads and the fadd
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_assign_matrix_double
+void add_compound_assign_matrix_double() { // double4x4 += double4x4; expects two loads, one fadd, one store
+double4x4 a;
+double4x4 b;
+  // NOOPT:       [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[A]], [[B]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a += b; // the second load (captured as A) is expected as the fadd's first operand
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_assign_matrix_double
+void subtract_compound_assign_matrix_double() { // double4x4 -= double4x4; expects two loads, one fsub, one store
+double4x4 a;
+double4x4 b;
+  // NOOPT:       [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <16 x double> [[A]], [[B]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a -= b; // operand order matters for fsub: the second load (captured as A) must be the minuend
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_matrix_float
+void add_matrix_matrix_float() { // float2x3 + float2x3; expects two <6 x float> loads, one fadd, one store
+float2x3 a;
+float2x3 b;
+float2x3 c;
+  // NOOPT:       [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[C:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[C:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[B]], [[C]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+
+  a = b + c; // non-square matrix: the checks expect a flat 6-element vector
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_assign_matrix_float
+void add_compound_assign_matrix_float() { // float2x3 += float2x3; expects two loads, one fadd, one store
+float2x3 a;
+float2x3 b;
+  // NOOPT:       [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[A]], [[B]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+
+  a += b; // the second load (captured as A) is expected as the fadd's first operand
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_assign_matrix_float
+void subtract_compound_assign_matrix_float() { // float2x3 -= float2x3; expects two loads, one fsub, one store
+float2x3 a;
+float2x3 b;
+  // NOOPT:       [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <6 x float> [[A]], [[B]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+
+  a -= b; // operand order matters for fsub: the second load (captured as A) must be the minuend
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_double_float
+void add_matrix_scalar_double_float() { // double4x4 + float; expects the scalar to be fpext'd to double and splatted before the fadd
+double4x4 a;
+float vf;
+  // NOOPT:       [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a = a + vf; // plain binary form: the matrix load is expected before the scalar load
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_double_float
+void add_compound_matrix_scalar_double_float() { // double4x4 += float; expects fpext of the scalar, a splat, then fadd and store
+double4x4 a;
+float vf;
+  // NOOPT:  [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4{{$}}
+  // OPT:    [[SCALAR:%.*]] = load float, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a += vf; // compound form: the scalar load and its fpext are expected before the matrix load
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_double_float
+void subtract_compound_matrix_scalar_double_float() { // double4x4 -= float; expects fpext of the scalar, a splat, then fsub and store
+double4x4 a;
+float vf;
+  // NOOPT:  [[SCALAR:%.*]] = load float, ptr %vf, align 4{{$}}
+  // OPT:    [[SCALAR:%.*]] = load float, ptr %vf, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a -= vf; // the local name vf is matched literally (%vf) by the scalar load checks, so it must not be renamed
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_double_double
+void add_matrix_scalar_double_double() { // double4x4 + double; expects the scalar to be splatted without any conversion
+double4x4 a;
+double vd;
+  // NOOPT:       [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+
+  a = a + vd; // the local name vd is matched literally (%vd) by the scalar load checks
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_double_double
+void add_compound_matrix_scalar_double_double() { // double4x4 += double; expects a splat of the scalar, one fadd, one store
+double4x4 a;
+double vd;
+  // NOOPT:       [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+  a += vd; // compound form: the scalar load is expected before the matrix load
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_double_double
+void subtract_compound_matrix_scalar_double_double() { // double4x4 -= double; expects a splat of the scalar, one fsub, one store
+double4x4 a;
+double vd;
+  // NOOPT:       [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <16 x double> poison, double [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <16 x double> [[SCALAR_EMBED]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <16 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <16 x double> [[RES]], ptr {{.*}}, align 8
+  a -= vd; // compound form: the scalar load is expected before the matrix load
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_float_float
+void add_matrix_scalar_float_float() { // float2x3 + float; expects the scalar to be splatted without any conversion
+float2x3 b;
+float vf;
+  // NOOPT:       [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[SCALAR:%.*]] = load float, ptr %vf, align 4{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[SCALAR:%.*]] = load float, ptr %vf, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+
+  b = b + vf; // plain binary form: the matrix load is expected before the scalar load
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_float_float
+void add_compound_matrix_scalar_float_float() { // float2x3 += float; expects a splat of the scalar, one fadd, one store
+float2x3 b;
+float vf;
+  // NOOPT:       [[SCALAR:%.*]] = load float, ptr %vf, align 4{{$}}
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <6 x float>, ptr %b, align 4{{$}}
+  // OPT:         [[SCALAR:%.*]] = load float, ptr %vf, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <6 x float>, ptr %b, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+  b += vf; // both local names (%vf, %b) are matched literally by the load checks
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_float_float
+void subtract_compound_matrix_scalar_float_float() { // float2x3 -= float; expects a splat of the scalar, one fsub, one store
+float2x3 b;
+float vf;
+  // NOOPT:       [[SCALAR:%.*]] = load float, ptr %vf, align 4{{$}}
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <6 x float>, ptr %b, align 4{{$}}
+  // OPT:         [[SCALAR:%.*]] = load float, ptr %vf, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <6 x float>, ptr %b, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+  b -= vf; // both local names (%vf, %b) are matched literally by the load checks
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_float_double
+void add_matrix_scalar_float_double() { // float2x3 + double; expects the scalar to be fptrunc'd to float and splatted before the fadd
+float2x3 b;
+double vd;
+  // NOOPT:       [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // OPT:         [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+
+  b = b + vd; // the wider scalar is narrowed to the matrix element type; the matrix is not widened
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_float_double
+void add_compound_matrix_scalar_float_double() { // float2x3 += double; expects fptrunc of the scalar, a splat, then fadd and store
+float2x3 b;
+double vd;
+  // NOOPT:       [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+  b += vd; // compound form: the scalar load and its fptrunc are expected before the matrix load
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_float_double
+void subtract_compound_matrix_scalar_float_double() { // float2x3 -= double; expects fptrunc of the scalar, a splat, then fsub and store
+float2x3 b;
+double vd;
+  // NOOPT:       [[SCALAR:%.*]] = load double, ptr %vd, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load double, ptr %vd, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <6 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> poison, float [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], ptr {{.*}}, align 4
+  b -= vd; // compound form: the scalar load and its fptrunc are expected before the matrix load
+}
+
+// Integer matrix/scalar additions
+
+// CHECK-LABEL: define {{.*}}add_matrix_matrix_int
+void add_matrix_matrix_int() { // int4x3 + int4x3; expects two <12 x i32> loads, one integer add, one store
+int4x3 a;
+int4x3 b;
+int4x3 c;
+  // NOOPT:       [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = add <12 x i32> [[B]], [[C]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr {{.*}}, align 4
+  a = b + c; // integer elements: the checks expect add rather than fadd
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_matrix_int
+void add_compound_matrix_matrix_int() { // int4x3 += int4x3; expects two loads, one integer add, one store
+int4x3 a;
+int4x3 b;
+  // NOOPT:       [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = add <12 x i32> [[A]], [[B]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr {{.*}}, align 4
+  a += b; // the second load (captured as A) is expected as the add's first operand
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_matrix_int
+void subtract_compound_matrix_matrix_int() { // int4x3 -= int4x3; expects two loads, one integer sub, one store
+int4x3 a;
+int4x3 b;
+  // NOOPT:       [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:         [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = sub <12 x i32> [[A]], [[B]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr {{.*}}, align 4
+  a -= b; // operand order matters for sub: the second load (captured as A) must be the minuend
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_matrix_uint64
+void add_matrix_matrix_uint64() { // uint64_t4x2 + uint64_t4x2; expects two <8 x i64> loads, one add, one store
+uint64_t4x2 a;
+uint64_t4x2 b;
+uint64_t4x2 c;
+  // NOOPT:       [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[C:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[C:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = add <8 x i64> [[B]], [[C]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  a = b + c; // 64-bit unsigned elements: the checks expect i64 vectors with 8-byte alignment
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_matrix_uint64
+void add_compound_matrix_matrix_uint64() { // uint64_t4x2 += uint64_t4x2; expects two loads, one add, one store
+uint64_t4x2 a;
+uint64_t4x2 b;
+  // NOOPT:       [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = add <8 x i64> [[A]], [[B]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  a += b; // the second load (captured as A) is expected as the add's first operand
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_matrix_uint64
+void subtract_compound_matrix_matrix_uint64() { // uint64_t4x2 -= uint64_t4x2; expects two loads, one sub, one store
+uint64_t4x2 a;
+uint64_t4x2 b;
+  // NOOPT:       [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT:         [[B:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // NOOPT-NEXT:  [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:    [[A:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[RES:%.*]] = sub <8 x i64> [[A]], [[B]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  a -= b; // operand order matters for sub: the second load (captured as A) must be the minuend
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_int16
+void add_matrix_scalar_int_int16() { // int4x3 + int16_t; expects the scalar to be sext'd to i32 and splatted before the add
+int4x3 a;
+int16_t vs;
+  // NOOPT:        [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}}
+  // NOOPT-NEXT:   [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:          [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4
+
+  a = a + vs; // the store must target the same address the matrix was loaded from (captured as MAT_ADDR)
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_int16
+void add_compound_matrix_scalar_int_int16() { // int4x3 += int16_t; expects sext of the scalar to i32, a splat, then add and store back to a
+int4x3 a;
+int16_t vs;
+  // NOOPT:       [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr %a, align 4
+
+  a += vs; // the splat must use the sext result, and the store must hit %a, the address the matrix was loaded from
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_int16
+void subtract_compound_matrix_scalar_int_int16() { // int4x3 -= int16_t; expects sext of the scalar to i32, a splat, then sub and store back to a
+int4x3 a;
+int16_t vs;
+  // The splat must use the sext result, and the store must hit %a, the address the matrix was loaded from.
+  // NOOPT:       [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr %a, align 4
+
+  a -= vs;
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_int64
+void add_matrix_scalar_int_int64() { // int4x3 + int64_t; expects the scalar to be trunc'd to i32 and splatted before the add
+int4x3 a;
+int64_t vli;
+  // NOOPT:        [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}}
+  // NOOPT-NEXT:   [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // OPT:          [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4
+
+  a = a + vli; // the wider scalar is narrowed to the matrix element type; the store must reuse the load address (MAT_ADDR)
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_int64
+void add_compound_matrix_scalar_int_int64() { // int4x3 += int64_t; expects trunc of the scalar to i32, a splat, then add and store back to a
+int4x3 a;
+int64_t vli;
+  // NOOPT:       [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr %a, align 4
+
+  a += vli; // the store must hit %a, the same address the matrix load check names
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_int64
+void subtract_compound_matrix_scalar_int_int64() { // int4x3 -= int64_t; expects trunc of the scalar to i32, a splat, then sub and store back to a
+int4x3 a;
+int64_t vli;
+  // NOOPT:       [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <12 x i32>, ptr %a, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <12 x i32> [[RES]], ptr %a, align 4
+
+  a -= vli; // the store must hit %a, the same address the matrix load check names
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_int_uint64
+void add_matrix_scalar_int_uint64() { // int4x3 + uint64_t; expects the scalar to be trunc'd to i32 and splatted before the add
+int4x3 a;
+uint64_t vulli;
+  // NOOPT:        [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}}
+  // NOOPT-NEXT:   [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // OPT:          [[MATRIX:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[MAT_ADDR]], align 4
+
+  a = a + vulli; // unsigned 64-bit scalar: the expected narrowing is the same trunc as in the signed int64_t case
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_int_uint64
+void add_compound_matrix_scalar_int_uint64() { // int4x3 += uint64_t; expects trunc of the scalar to i32, a splat, then add and store to the loaded address
+int4x3 a;
+uint64_t vulli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[MATRIX_ADDR]], align 4
+
+  a += vulli; // the store must reuse the address captured by this function's own matrix load (MATRIX_ADDR)
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_int_uint64
+void subtract_compound_matrix_scalar_int_uint64() { // int4x3 -= uint64_t; expects trunc of the scalar to i32, a splat, then sub and store to the loaded address
+int4x3 a;
+uint64_t vulli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <12 x i32>, ptr [[MATRIX_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <12 x i32> poison, i32 [[SCALAR_TRUNC]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <12 x i32> [[SCALAR_EMBED]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = sub <12 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[MATRIX_ADDR]], align 4
+
+  a -= vulli; // the store must reuse the address captured by this function's own matrix load (MATRIX_ADDR)
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_short
+void add_matrix_scalar_uint64_short() { // short + uint64_t4x2 (scalar on the left); expects sext of the scalar to i64 and a splat before the add
+uint64_t4x2 b;
+short vs;
+  // NOOPT:         [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:           [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64
+  // NOOPT-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:      [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:    [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:    store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b = vs + b; // scalar is the left operand, so the splat is expected as the add's first operand
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_short
+void add_compound_matrix_scalar_uint64_short() { // uint64_t4x2 += short; expects sext of the scalar to i64, a splat, then add and store
+uint64_t4x2 b;
+short vs;
+  // NOOPT:       [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b += vs; // compound form: the matrix is expected as the add's first operand, the splat as the second
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_short
+void subtract_compound_matrix_scalar_uint64_short() { // uint64_t4x2 -= short; expects sext of the scalar to i64, a splat, then sub and store
+uint64_t4x2 b;
+short vs;
+  // NOOPT:       [[SCALAR:%.*]] = load i16, ptr %vs, align 2{{$}}
+  // OPT:         [[SCALAR:%.*]] = load i16, ptr %vs, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64
+  // NOOPT-NEXT:  [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8{{$}}
+  // OPT-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR_EXT]], i64 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b -= vs; // operand order matters for sub: the matrix must be the minuend and the splat the subtrahend
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_int
+void add_matrix_scalar_uint64_int() { // int64_t + uint64_t4x2 (scalar on the left); expects a splat of the i64 scalar with no conversion
+uint64_t4x2 b;
+int64_t vli;
+  // NOOPT:         [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // NOOPT-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT:           [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:    [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:    store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b = vli + b; // scalar is the left operand, so the splat is expected as the add's first operand
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_int
+void add_compound_matrix_scalar_uint64_int() { // uint64_t4x2 += int64_t; expects a splat of the i64 scalar, one add, one store
+uint64_t4x2 b;
+int64_t vli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b += vli; // compound form: the matrix is expected as the add's first operand, the splat as the second
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_int
+void subtract_compound_matrix_scalar_uint64_int() {
+uint64_t4x2 b;
+int64_t vli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vli, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <8 x i64>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b -= vli;
+}
+
+// CHECK-LABEL: define {{.*}}add_matrix_scalar_uint64_uint64
+void add_matrix_scalar_uint64_uint64() {
+uint64_t4x2 b;
+uint64_t vulli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], ptr {{.*}}, align 8
+  b = vulli + b;
+}
+
+// CHECK-LABEL: define {{.*}}add_compound_matrix_scalar_uint64_uint64
+void add_compound_matrix_scalar_uint64_uint64() {
+uint64_t4x2 b;
+uint64_t vulli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b += vulli;
+}
+
+// CHECK-LABEL: define {{.*}}subtract_compound_matrix_scalar_uint64_uint64
+void subtract_compound_matrix_scalar_uint64_uint64() {
+uint64_t4x2 b;
+uint64_t vulli;
+  // NOOPT:        [[SCALAR:%.*]] = load i64, ptr %vulli, align 8{{$}}
+  // NOOPT-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8{{$}}
+  // OPT:          [[SCALAR:%.*]] = load i64, ptr %vulli, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[MATRIX:%.*]] = load <8 x i64>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> poison, i64 [[SCALAR]], i64 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = sub <8 x i64> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], ptr {{.*}}, align 8
+
+  b -= vulli;
+}
+
+// Tests for matrix multiplication.
+
+// CHECK-LABEL: define {{.*}}multiply_matrix_matrix_double
+void multiply_matrix_matrix_double() {
+double4x4 b;
+double4x4 c;
+  // NOOPT:         [[B:%.*]] = load <16 x double>, ptr %b, align 8{{$}}
+  // NOOPT-NEXT:    [[C:%.*]] = load <16 x double>, ptr %c, align 8{{$}}
+  // OPT:           [[B:%.*]] = load <16 x double>, ptr %b, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[C:%.*]] = load <16 x double>, ptr %c, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> [[B]], <16 x double> [[C]], i32 4, i32 4, i32 4)
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr %a, align 8
+  // OPT-NEXT:     call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // OPT-NEXT:     call void @llvm.lifetime.end.p0(i64 128, ptr %c)
+  // OPT-NEXT:     call void @llvm.lifetime.end.p0(i64 128, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  double4x4 a;
+  a = b * c;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_matrix_matrix_double
+void multiply_compound_matrix_matrix_double() {
+double4x4 b;
+double4x4 c;
+  // NOOPT:        [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:   [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:          [[C:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:     [[B:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[RES:%.*]] = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> [[B]], <16 x double> [[C]], i32 4, i32 4, i32 4)
+  // CHECK-NEXT:   store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:     call void @llvm.lifetime.end.p0(i64 128, ptr %c)
+  // OPT-NEXT:     call void @llvm.lifetime.end.p0(i64 128, ptr %b)
+  // CHECK-NEXT:   ret void
+  b *= c;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_matrix_matrix_int
+void multiply_matrix_matrix_int() {
+int4x3 b;
+int3x4 c;
+  // NOOPT:         [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // NOOPT-NEXT:    [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[B:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[C:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = call <16 x i32> @llvm.matrix.multiply.v16i32.v12i32.v12i32(<12 x i32> [[B]], <12 x i32> [[C]], i32 4, i32 3, i32 4)
+  // CHECK-NEXT:    store <16 x i32> [[RES]], ptr %a, align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 64, ptr %a)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %c)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+  int4x4 a;
+  a = b * c;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_double_matrix_scalar_float
+void multiply_double_matrix_scalar_float() {
+double4x4 a;
+float s;
+  // NOOPT:         [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load float, ptr %s, align 4{{$}}
+  // OPT:           [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load float, ptr %s, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_EXT:%.*]] = fpext float [[S]] to double
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a * s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_double_matrix_scalar_float
+void multiply_compound_double_matrix_scalar_float() {
+double4x4 a;
+float s;
+  // NOOPT:         [[S:%.*]] = load float, ptr %s, align 4{{$}}
+  // OPT:           [[S:%.*]] = load float, ptr %s, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_EXT:%.*]] = fpext float [[S]] to double
+  // NOOPT-NEXT:    [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT-NEXT:      [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a *= s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_double_matrix_scalar_double
+void multiply_double_matrix_scalar_double() {
+double4x4 a;
+double s;
+  // NOOPT:         [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // OPT:           [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a * s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_double_matrix_scalar_double
+void multiply_compound_double_matrix_scalar_double() {
+double4x4 a;
+double s;
+  // NOOPT:         [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // NOOPT-NEXT:    [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:           [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a *= s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_float_matrix_scalar_double
+void multiply_float_matrix_scalar_double() {
+float2x3 b;
+double s;
+  // NOOPT:         [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // OPT:           [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = fptrunc double [[S]] to float
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <6 x float> [[VECSPLAT]], [[MAT]]
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = s * b;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_float_matrix_scalar_double
+void multiply_compound_float_matrix_scalar_double() {
+float2x3 b;
+double s;
+  // NOOPT:         [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // OPT:           [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = fptrunc double [[S]] to float
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <6 x float> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+  b *= s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_int_matrix_scalar_int16
+void multiply_int_matrix_scalar_int16() {
+int4x3 b;
+int16_t s;
+  // NOOPT:         [[S:%.*]] = load i16, ptr %s, align 2{{$}}
+  // OPT:           [[S:%.*]] = load i16, ptr %s, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_EXT:%.*]] = sext i16 [[S]] to i32
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = mul <12 x i32> [[VECSPLAT]], [[MAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = s * b;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_scalar_int16
+void multiply_compound_int_matrix_scalar_int16() {
+int4x3 b;
+int16_t s;
+  // NOOPT:        [[S:%.*]] = load i16, ptr %s, align 2{{$}}
+  // OPT:          [[S:%.*]] = load i16, ptr %s, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[S_EXT:%.*]] = sext i16 [[S]] to i32
+  // NOOPT-NEXT:   [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // OPT-NEXT:     [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:   [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0
+  // CHECK-NEXT:   [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:   store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:   ret void
+  b *= s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_int_matrix_scalar_ull
+void multiply_int_matrix_scalar_ull() {
+int4x3 b;
+uint64_t s;
+  // NOOPT:         [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // OPT:           [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load i64, ptr %s, align 8{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load i64, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = b * s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_scalar_ull
+void multiply_compound_int_matrix_scalar_ull() {
+int4x3 b;
+uint64_t s;
+  // NOOPT:         [[S:%.*]] = load i64, ptr %s, align 8{{$}}
+  // OPT:           [[S:%.*]] = load i64, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = mul <12 x i32> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  b *= s;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_float_matrix_constant
+void multiply_float_matrix_constant() {
+float2x3 a;
+  // CHECK:         [[A_ADDR:%.*]] = alloca [6 x float], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 24, ptr %a)
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <6 x float> [[MAT]], <float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00>
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[A_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a * 2.5;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_float_matrix_constant
+void multiply_compound_float_matrix_constant() {
+float2x3 a;
+  // CHECK:         [[A_ADDR:%.*]] = alloca [6 x float], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 24, ptr %a)
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = fmul <6 x float> [[MAT]], <float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00>
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[A_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %a)
+  // CHECK-NEXT:    ret void
+  a *= 2.5;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_int_matrix_constant
+void multiply_int_matrix_constant() {
+int4x3 a;
+  // CHECK:         [[A_ADDR:%.*]] = alloca [12 x i32], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 48, ptr %a)
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = mul <12 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, [[MAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[A_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = 5 * a;
+}
+
+// CHECK-LABEL: define {{.*}}multiply_compound_int_matrix_constant
+void multiply_compound_int_matrix_constant() {
+int4x3 a;
+  // CHECK:         [[A_ADDR:%.*]] = alloca [12 x i32], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 48, ptr %a)
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = mul <12 x i32> [[MAT]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[A_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %a)
+  // CHECK-NEXT:    ret void
+  a *= 5;
+}
+
+// CHECK-LABEL: define {{.*}}divide_double_matrix_scalar_float
+void divide_double_matrix_scalar_float() {
+double4x4 a;
+float s;
+  // NOOPT:         [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load float, ptr %s, align 4{{$}}
+  // OPT:           [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load float, ptr %s, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_EXT:%.*]] = fpext float [[S]] to double
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S_EXT]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fdiv <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_double_matrix_scalar_double
+void divide_double_matrix_scalar_double() {
+double4x4 a;
+double s;
+  // NOOPT:         [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // OPT:           [[A:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <16 x double> poison, double [[S]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <16 x double> [[VECINSERT]], <16 x double> poison, <16 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fdiv <16 x double> [[A]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <16 x double> [[RES]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_float_matrix_scalar_double
+void divide_float_matrix_scalar_double() {
+float2x3 b;
+double s;
+  // NOOPT:         [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load double, ptr %s, align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load double, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = fptrunc double [[S]] to float
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <6 x float> poison, float [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <6 x float> [[VECINSERT]], <6 x float> poison, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = fdiv <6 x float> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = b / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_int_matrix_scalar_int16
+void divide_int_matrix_scalar_int16() {
+int4x3 b;
+int16_t s;
+  // NOOPT:         [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load i16, ptr %s, align 2{{$}}
+  // OPT:           [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load i16, ptr %s, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_EXT:%.*]] = sext i16 [[S]] to i32
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_EXT]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = sdiv <12 x i32> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = b / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_int_matrix_scalar_ull
+void divide_int_matrix_scalar_ull() {
+int4x3 b;
+uint64_t s;
+  // NOOPT:         [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load i64, ptr %s, align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load i64, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[S_TRUNC:%.*]] = trunc i64 [[S]] to i32
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <12 x i32> poison, i32 [[S_TRUNC]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <12 x i32> [[VECINSERT]], <12 x i32> poison, <12 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = sdiv <12 x i32> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <12 x i32> [[RES]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = b / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_ull_matrix_scalar_ull
+void divide_ull_matrix_scalar_ull() {
+uint64_t4x2 b;
+uint64_t s;
+  // NOOPT:         [[MAT:%.*]] = load <8 x i64>, ptr [[B:%.*]], align 8{{$}}
+  // NOOPT-NEXT:    [[S:%.*]] = load i64, ptr %s, align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <8 x i64>, ptr [[B:%.*]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[S:%.*]] = load i64, ptr %s, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[VECINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[S]], i64 0
+  // CHECK-NEXT:    [[VECSPLAT:%.*]] = shufflevector <8 x i64> [[VECINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = udiv <8 x i64> [[MAT]], [[VECSPLAT]]
+  // CHECK-NEXT:    store <8 x i64> [[RES]], ptr [[B]], align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %s)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 64, ptr %b)
+  // CHECK-NEXT:    ret void
+  b = b / s;
+}
+
+// CHECK-LABEL: define {{.*}}divide_float_matrix_constant
+void divide_float_matrix_constant() {
+float2x3 a;
+  // CHECK:         [[A_ADDR:%.*]] = alloca [6 x float], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.start.p0(i64 24, ptr %a)
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x float>, ptr [[A_ADDR]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[RES:%.*]] = fdiv <6 x float> [[MAT]], <float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00, float 2.500000e+00>
+  // CHECK-NEXT:    store <6 x float> [[RES]], ptr [[A_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %a)
+  // CHECK-NEXT:    ret void
+  a = a / 2.5;
+}
+
+  // Tests for the matrix type operators.
+
+  // Check that we can use matrix index expressions on different floating-point
+  // matrices and indices.
+// CHECK-LABEL: define {{.*}}insert_double_matrix_const_idx_ll_u_double
+void insert_double_matrix_const_idx_ll_u_double() {
+double4x4 a;
+double d;
+float2x3 b;
+float e;
+int j;
+uint k;
+  // NOOPT:         [[D:%.*]] = load double, ptr %d, align 8{{$}}
+  // OPT:           [[D:%.*]] = load double, ptr %d, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <16 x double> [[MAT]], double [[D]], i64 4
+  // CHECK-NEXT:    store <16 x double> [[MATINS]], ptr {{.*}}, align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %k)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %e)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %d)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a[0ll][1u] = d;
+}
+
+// CHECK-LABEL: define {{.*}}insert_double_matrix_const_idx_i_u_double
+void insert_double_matrix_const_idx_i_u_double() {
+double4x4 a;
+double d;
+  // NOOPT:         [[D:%.*]] = load double, ptr %d, align 8{{$}}
+  // OPT:           [[D:%.*]] = load double, ptr %d, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, ptr [[B:%.*]], align 8{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <16 x double> [[MAT]], double [[D]], i64 13
+  // CHECK-NEXT:    store <16 x double> [[MATINS]], ptr [[B]], align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %d)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a[1][3u] = d;
+}
+
+// CHECK-LABEL: define {{.*}}insert_float_matrix_const_idx_ull_i_float
+void insert_float_matrix_const_idx_ull_i_float() {
+float2x3 b;
+float e;
+  // NOOPT:         [[E:%.*]] = load float, ptr %e, align 4{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 3
+  // CHECK-NEXT:    store <6 x float> [[MATINS]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %e)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  b[1ull][1] = e;
+}
+
+// CHECK-LABEL: define {{.*}}insert_float_matrix_idx_i_u_float
+void insert_float_matrix_idx_i_u_float() {
+float2x3 b;
+float e;
+int j;
+uint k;
+  // NOOPT:         [[E:%.*]] = load float, ptr %e, align 4{{$}}
+  // NOOPT-NEXT:    [[J:%.*]] = load i32, ptr %j, align 4{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i32, ptr %j, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = sext i32 [[J]] to i64
+  // NOOPT-NEXT:    [[K:%.*]] = load i32, ptr %k, align 4{{$}}
+  // OPT-NEXT:      [[K:%.*]] = load i32, ptr %k, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[K_EXT:%.*]] = zext i32 [[K]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 2
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <6 x float> [[MATINS]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %k)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %e)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  b[j][k] = e;
+}
+
+// CHECK-LABEL: define {{.*}}insert_float_matrix_idx_s_ull_float
+void insert_float_matrix_idx_s_ull_float() {
+float2x3 b;
+float e;
+int16_t j;
+uint64_t k;
+  // NOOPT:         [[E:%.*]] = load float, ptr %e, align 4{{$}}
+  // NOOPT-NEXT:    [[J:%.*]] = load i16, ptr %j, align 2{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i16, ptr %j, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = sext i16 [[J]] to i64
+  // NOOPT-NEXT:    [[K:%.*]] = load i64, ptr %k, align 8{{$}}
+  // OPT-NEXT:      [[K:%.*]] = load i64, ptr %k, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K]], 2
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr [[B:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <6 x float> [[MATINS]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %k)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %e)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  (b)[j][k] = e;
+}
+
+  // Check that we can use matrix index expressions on integer matrixes.
+// CHECK-LABEL: define {{.*}}insert_int_idx_expr
+void insert_int_idx_expr() {
+int4x3 a;
+int i;
+  // NOOPT:         [[I1:%.*]] = load i32, ptr %i, align 4{{$}}
+  // NOOPT-NEXT:    [[I2:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT:           [[I1:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[I2_ADD:%.*]] = add nsw i32 4, [[I2]]
+  // CHECK-NEXT:    [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 8, [[ADD_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 12
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[B:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <12 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <12 x i32> [[MATINS]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %i)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a[4 + i][1 + 1u] = i;
+}
+
+  // Check that we can use matrix index expressions on FP and integer
+  // matrixes.
+// CHECK-LABEL: define {{.*}}insert_float_into_int_matrix
+void insert_float_into_int_matrix() {
+int4x3 a;
+int i;
+  // NOOPT:         [[I:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT:           [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr [[MAT_ADDR:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <12 x i32> [[MAT]], i32 [[I]], i64 7
+  // CHECK-NEXT:    store <12 x i32> [[MATINS]], ptr [[MAT_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %i)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a[3][1] = i;
+}
+
+  // Check that we can use overloaded matrix index expressions on matrixes with
+  // matching dimensions, but different element types.
+// CHECK-LABEL: define {{.*}}insert_matching_dimensions1
+void insert_matching_dimensions1() {
+double3x3 a;
+double i;
+  // NOOPT:         [[I:%.*]] = load double, ptr %i, align 8{{$}}
+  // OPT:           [[I:%.*]] = load double, ptr %i, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x double>, ptr [[B:%.*]], align 8{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <9 x double> [[MAT]], double [[I]], i64 5
+  // CHECK-NEXT:    store <9 x double> [[MATINS]], ptr [[B]], align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %i)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 72, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a[2u][1u] = i;
+}
+
+// CHECK-LABEL: define {{.*}}insert_matching_dimensions
+void insert_matching_dimensions() {
+float3x3 b;
+float e;
+  // NOOPT:         [[E:%.*]] = load float, ptr %e, align 4{{$}}
+  // OPT:           [[E:%.*]] = load float, ptr %e, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x float>, ptr [[B:%.*]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <9 x float> [[MAT]], float [[E]], i64 7
+  // CHECK-NEXT:    store <9 x float> [[MATINS]], ptr [[B]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %e)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 36, ptr %b)
+  // CHECK-NEXT:    ret void
+
+  b[1u][2u] = e;
+}
+
+// CHECK-LABEL: define {{.*}}extract_double
+double extract_double() {
+double4x4 a;
+  // NOOPT:         [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <16 x double>, ptr {{.*}}, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 10
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret double [[MATEXT]]
+
+  return a[2][3 - 1u];
+}
+
+// CHECK-LABEL: define {{.*}}extract_float
+double extract_float() {
+float3x3 b;
+  // NOOPT:         [[MAT:%.*]] = load <9 x float>, ptr {{.*}}, align 4{{$}}
+  // OPT:           [[MAT:%.*]] = load <9 x float>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 5
+  // CHECK-NEXT:    [[TO_DOUBLE:%.*]] = fpext float [[MATEXT]] to double
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 36, ptr %b)
+  // CHECK-NEXT:    ret double [[TO_DOUBLE]]
+
+  return b[2][1];
+}
+
+// CHECK-LABEL: define {{.*}}extract_int
+int extract_int() {
+int4x3 c;
+uint64_t j;
+  // NOOPT:         [[J1:%.*]] = load i64, ptr %j, align 8{{$}}
+  // NOOPT-NEXT:    [[J2:%.*]] = load i64, ptr %j, align 8{{$}}
+  // OPT:           [[J1:%.*]] = load i64, ptr %j, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // OPT-NEXT:      [[J2:%.*]] = load i64, ptr %j, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J2]], 4
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]]
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 12
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // OPT-NEXT:      [[MAT:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <12 x i32> [[MAT]], i64 [[IDX2]]
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 48, ptr %c)
+  // CHECK-NEXT:    ret i32 [[MATEXT]]
+
+  return c[j][j];
+}
+
+// CHECK-LABEL: define {{.*}}test_extract_matrix_pointer1
+double test_extract_matrix_pointer1() {
+double3x2 ptr[3][3];
+uint j;
+  // NOOPT:         [[J:%.*]] = load i32, ptr %j, align 4{{$}}
+  // OPT:           [[J:%.*]] = load i32, ptr %j, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX:%.*]] = add i64 3, [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[ARIX:%.*]] = getelementptr inbounds [3 x [3 x [6 x double]]], ptr %ptr, i64 0, i64 1
+  // CHECK-NEXT:    [[ARIX1:%.*]] = getelementptr inbounds [3 x [6 x double]], ptr [[ARIX]], i64 0, i64 2
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <6 x double>, ptr [[ARIX1]], align 8{{$}}
+  // OPT-NEXT:      [[MAT:%.*]] = load <6 x double>, ptr [[ARIX1]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]]
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 432, ptr %ptr)
+  // CHECK-NEXT:    ret double [[MATEXT]]
+
+  return ptr[1][2][j][1];
+}
+
+// CHECK-LABEL: define {{.*}}test_extract_matrix_pointer2
+double test_extract_matrix_pointer2() {
+double3x2 ptr[7][7];
+  // CHECK:         [[ARIX:%.*]] = getelementptr inbounds [7 x [7 x [6 x double]]], ptr %ptr, i64 0, i64 4
+  // CHECK-NEXT:    [[ARIX1:%.*]] = getelementptr inbounds [7 x [6 x double]], ptr [[ARIX]], i64 0, i64 6
+  // NOOPT:         [[MAT:%.*]] = load <6 x double>, ptr [[ARIX1]], align 8{{$}}
+  // OPT:           [[MAT:%.*]] = load <6 x double>, ptr [[ARIX1]], align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 5
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2352, ptr %ptr)
+  // CHECK-NEXT:    ret double [[MATEXT]]
+
+  return ptr[4][6][2][1 * 3 - 2];
+}
+
+// CHECK-LABEL: define {{.*}}insert_extract
+void insert_extract() {
+double4x4 a;
+float3x3 b;
+uint64_t j;
+int16_t k;
+  // NOOPT:         [[K:%.*]] = load i16, ptr %k, align 2{{$}}
+  // OPT:           [[K:%.*]] = load i16, ptr %k, align 2, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[K_EXT:%.*]] = sext i16 [[K]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 3
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], 0
+  // NOOPT-NEXT:    [[MAT:%.*]] = load <9 x float>, ptr [[MAT_ADDR:%.*]], align 4{{$}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // OPT-NEXT:      [[MAT:%.*]] = load <9 x float>, ptr [[MAT_ADDR:%.*]], align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]]
+  // NOOPT-NEXT:    [[J:%.*]] = load i64, ptr %j, align 8{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i64, ptr %j, align 8, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[IDX3:%.*]] = mul i64 [[J]], 3
+  // CHECK-NEXT:    [[IDX4:%.*]] = add i64 [[IDX3]], 2
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT2:%.*]] = load <9 x float>, ptr [[MAT_ADDR]], align 4{{$}}
+  // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]]
+  // CHECK-NEXT:    store <9 x float> [[MATINS]], ptr [[MAT_ADDR]], align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 2, ptr %k)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 8, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 36, ptr %b)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  b[2][j] = b[0][k];
+}
+
+// CHECK-LABEL: define {{.*}}insert_compound_stmt
+void insert_compound_stmt() {
+double4x4 a;
+  // CHECK:        [[A:%.*]] = load <16 x double>, ptr [[A_PTR:%.*]], align 8{{$}}
+  // CHECK-NEXT:   [[EXT:%.*]] = extractelement <16 x double> [[A]], i64 14
+  // CHECK-NEXT:   [[SUB:%.*]] = fsub double [[EXT]], 1.000000e+00
+  // CHECK-NEXT:   [[A2:%.*]] = load <16 x double>, ptr [[A_PTR]], align 8{{$}}
+  // CHECK-NEXT:   [[INS:%.*]] = insertelement <16 x double> [[A2]], double [[SUB]], i64 14
+  // CHECK-NEXT:   store <16 x double> [[INS]], ptr [[A_PTR]], align 8
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 128, ptr %a)
+  // CHECK-NEXT:   ret void
+
+  a[2][3] -= 1.0;
+}
+
+struct Foo {
+  float2x3 mat;
+};
+
+// CHECK-LABEL: define {{.*}}insert_compound_stmt_field
+void insert_compound_stmt_field() {
+struct Foo a;
+float f;
+uint i;
+uint j;
+  // NOOPT:         [[I:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT:           [[I:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[I_EXT:%.*]] = zext i32 [[I]] to i64
+  // NOOPT-NEXT:    [[J:%.*]] = load i32, ptr %j, align 4{{$}}
+  // OPT-NEXT:      [[J:%.*]] = load i32, ptr %j, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, ptr %mat, align 4{{$}}
+  // CHECK-NEXT:    [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]]
+  // CHECK-NEXT:    [[SUM:%.*]] = fadd float [[EXT]], {{.*}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT2:%.*]] = load <6 x float>, ptr %mat, align 4{{$}}
+  // CHECK-NEXT:    [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]]
+  // CHECK-NEXT:    store <6 x float> [[INS]], ptr %mat, align 4
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %j)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %i)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 4, ptr %f)
+  // OPT-NEXT:      call void @llvm.lifetime.end.p0(i64 24, ptr %a)
+  // CHECK-NEXT:    ret void
+
+  a.mat[i][j] += f;
+}
+
+// CHECK-LABEL: define {{.*}}matrix_as_idx
+void matrix_as_idx() {
+int4x3 a;
+int i;
+int j;
+double4x4 b;
+  // NOOPT:       [[I1:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT:         [[I1:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[I1_EXT:%.*]] = sext i32 [[I1]] to i64
+  // NOOPT-NEXT:  [[J1:%.*]] = load i32, ptr %j, align 4{{$}}
+  // OPT-NEXT:    [[J1:%.*]] = load i32, ptr %j, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[J1_EXT:%.*]] = sext i32 [[J1]] to i64
+  // CHECK-NEXT:  [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 4
+  // CHECK-NEXT:  [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]]
+  // NOOPT-NEXT:  [[A:%.*]] = load <12 x i32>, ptr %a, align 4{{$}}
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 12
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // OPT-NEXT:    [[A:%.*]] = load <12 x i32>, ptr %a, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[MI1:%.*]] = extractelement <12 x i32> [[A]], i64 [[IDX1_2]]
+  // CHECK-NEXT:  [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64
+  // NOOPT-NEXT:  [[J2:%.*]] = load i32, ptr %j, align 4{{$}}
+  // OPT-NEXT:    [[J2:%.*]] = load i32, ptr %j, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[J2_EXT:%.*]] = sext i32 [[J2]] to i64
+  // NOOPT-NEXT:  [[I2:%.*]] = load i32, ptr %i, align 4{{$}}
+  // OPT-NEXT:    [[I2:%.*]] = load i32, ptr %i, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[I2_EXT:%.*]] = sext i32 [[I2]] to i64
+  // CHECK-NEXT:  [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 4
+  // CHECK-NEXT:  [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]]
+  // NOOPT-NEXT:  [[A2:%.*]] = load <12 x i32>, ptr {{.*}}, align 4{{$}}
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 12
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // OPT-NEXT:    [[A2:%.*]] = load <12 x i32>, ptr {{.*}}, align 4, !tbaa !{{[0-9]+}}{{$}}
+  // CHECK-NEXT:  [[MI2:%.*]] = extractelement <12 x i32> [[A2]], i64 [[IDX2_2]]
+  // CHECK-NEXT:  [[MI3:%.*]] = add nsw i32 [[MI2]], 2
+  // CHECK-NEXT:  [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64
+  // CHECK-NEXT:  [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 4
+  // CHECK-NEXT:  [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 16
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:  [[B:%.*]] = load <16 x double>, ptr [[B_PTR:%.*]], align 8{{$}}
+  // CHECK-NEXT:  [[INS:%.*]] = insertelement <16 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]]
+  // CHECK-NEXT:  store <16 x double> [[INS]], ptr [[B_PTR]], align 8
+  // OPT-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %b)
+  // OPT-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %j)
+  // OPT-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %i)
+  // OPT-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr %a)
+  // CHECK-NEXT:  ret void
+
+  b[a[i][j]][a[j][i] + 2] = 1.5;
+}
+
+
diff --git a/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl
new file mode 100644
index 00000000000000..310b8b717e72b8
--- /dev/null
+++ b/clang/test/CodeGenHLSL/Types/BuiltinMatrix/matrix-type.hlsl
@@ -0,0 +1,217 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - -DSPIRV | FileCheck %s --check-prefixes=CHECK,SPIRV
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK: %struct.Matrix = type { i16, [12 x float], float }
+
+// CHECK-LABEL:  define {{.*}}load_store_double
+void load_store_double() {
+  double4x4 a;
+  double4x4 b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca [16 x double], align 8
+  // CHECK-NEXT:    %b = alloca [16 x double], align 8
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <16 x double>, ptr %b, align 8
+  // CHECK-NEXT:    store <16 x double> [[tmp]], ptr %a, align 8
+  // CHECK-NEXT:   ret void
+
+  a = b;
+}
+
+// CHECK-LABEL:  define {{.*}}load_store_float
+void load_store_float() {
+  float3x4 a;
+  float3x4 b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca [12 x float], align 4
+  // CHECK-NEXT:    %b = alloca [12 x float], align 4
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <12 x float>, ptr %b, align 4
+  // CHECK-NEXT:    store <12 x float> [[tmp]], ptr %a, align 4
+  // CHECK-NEXT:   ret void
+
+  a = b;
+}
+
+// CHECK-LABEL:  define {{.*}}load_store_int
+void load_store_int() {
+  int3x4 a;
+  int3x4 b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca [12 x i32], align 4
+  // CHECK-NEXT:    %b = alloca [12 x i32], align 4
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <12 x i32>, ptr %b, align 4
+  // CHECK-NEXT:    store <12 x i32> [[tmp]], ptr %a, align 4
+  // CHECK-NEXT:   ret void
+
+  a = b;
+}
+
+// CHECK-LABEL:  define {{.*}}load_store_ull
+void load_store_ull() {
+  uint64_t3x4 a;
+  uint64_t3x4 b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca [12 x i64], align 8
+  // CHECK-NEXT:    %b = alloca [12 x i64], align 8
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <12 x i64>, ptr %b, align 8
+  // CHECK-NEXT:    store <12 x i64> [[tmp]], ptr %a, align 8
+  // CHECK-NEXT:   ret void
+
+  a = b;
+}
+
+// CHECK-LABEL:  define {{.*}}load_store_fp16
+void load_store_fp16() {
+  float16_t3x4 a;
+  float16_t3x4 b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca [12 x half], align 2
+  // CHECK-NEXT:    %b = alloca [12 x half], align 2
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <12 x half>, ptr %b, align 2
+  // CHECK-NEXT:    store <12 x half> [[tmp]], ptr %a, align 2
+  // CHECK-NEXT:   ret void
+
+  a = b;
+}
+
+
+typedef struct {
+  uint16_t Tmp1;
+  float3x4 Data;
+  float Tmp2;
+} Matrix;
+
+// CHECK-LABEL: define {{.*}}matrix_struct
+void matrix_struct() {
+  Matrix a;
+  Matrix b;
+  // CHECK-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // CHECK-NEXT:    %a = alloca %struct.Matrix, align 4
+  // CHECK-NEXT:    %b = alloca %struct.Matrix, align 4
+  // CHECK-NEXT:    %Data = getelementptr inbounds nuw %struct.Matrix, ptr %a, i32 0, i32 1
+  // CHECK-NEXT:    [[tmp:%[0-9]*]] = load <12 x float>, ptr %Data, align 4
+  // CHECK-NEXT:    %Data1 = getelementptr inbounds nuw %struct.Matrix, ptr %b, i32 0, i32 1
+  // CHECK-NEXT:    store <12 x float> [[tmp]], ptr %Data1, align 4
+  // CHECK-NEXT:    ret void
+  b.Data = a.Data;
+}
+
+// The following require matrix mangling, which is currently only available to SPIRV.
+#ifdef SPIRV
+
+// SPIRV-LABEL: define {{.*}}parameter_passing
+void parameter_passing(in float3x3 a, inout float3x3 b, out float3x3 c) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    %a.addr = alloca [9 x float], align 4
+  // SPIRV-NEXT:    %b.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    %c.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    store <9 x float> %a, ptr %a.addr, align 4
+  // SPIRV-NEXT:    store ptr %b, ptr %b.addr, align 8
+  // SPIRV-NEXT:    store ptr %c, ptr %c.addr, align 8
+  // SPIRV-NEXT:    %1 = load <9 x float>, ptr %a.addr, align 4
+  // SPIRV-NEXT:    %2 = load ptr, ptr %b.addr, align 8
+  // SPIRV-NEXT:    store <9 x float> %1, ptr %2, align 4
+  // SPIRV-NEXT:    %3 = load ptr, ptr %c.addr, align 8
+  // SPIRV-NEXT:    store <9 x float> %1, ptr %3, align 4
+  // SPIRV-NEXT:    ret void
+  c = b = a;
+}
+
+// SPIRV-LABEL: define {{.*}}return_matrix
+float3x3 return_matrix(inout float3x3 a) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    %a.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    store ptr %a, ptr %a.addr, align 8
+  // SPIRV-NEXT:    %1 = load ptr, ptr %a.addr, align 8
+  // SPIRV-NEXT:    %2 = load <9 x float>, ptr %1, align 4
+  // SPIRV-NEXT:    ret <9 x float> %2
+  return a;
+}
+
+
+class MatrixClass {
+  int Tmp1;
+  float3x4 Data;
+  long Tmp2;
+};
+
+// SPIRV-LABEL: define {{.*}}matrix_class_reference
+void matrix_class_reference(inout MatrixClass a, inout MatrixClass b) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    %a.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    %b.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    store ptr %a, ptr %a.addr, align 8
+  // SPIRV-NEXT:    store ptr %b, ptr %b.addr, align 8
+  // SPIRV-NEXT:    %1 = load ptr, ptr %a.addr, align 8
+  // SPIRV-NEXT:    %Data = getelementptr inbounds nuw %class.MatrixClass, ptr %1, i32 0, i32 1
+  // SPIRV-NEXT:    %2 = load <12 x float>, ptr %Data, align 4
+  // SPIRV-NEXT:    %3 = load ptr, ptr %b.addr, align 8
+  // SPIRV-NEXT:    %Data1 = getelementptr inbounds nuw %class.MatrixClass, ptr %3, i32 0, i32 1
+  // SPIRV-NEXT:    store <12 x float> %2, ptr %Data1, align 4
+  // SPIRV-NEXT:    ret void
+  b.Data = a.Data;
+}
+
+template <typename Ty, unsigned Rows, unsigned Cols>
+class MatrixClassTemplate {
+  using MatrixTy = matrix<Ty, Rows, Cols>;
+  int Tmp1;
+  MatrixTy Data;
+  long Tmp2;
+};
+
+template <typename Ty, unsigned Rows, unsigned Cols>
+void matrix_template_reference(inout MatrixClassTemplate<Ty, Rows, Cols> a, inout MatrixClassTemplate<Ty, Rows, Cols> b) {
+  b.Data = a.Data;
+}
+
+// SPIRV-LABEL: define {{.*}}matrix_template_reference_caller
+MatrixClassTemplate<float, 3, 4> matrix_template_reference_caller(matrix<float,3,4> Data) {
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    %Data.addr = alloca [12 x float], align 4
+  // SPIRV-NEXT:    %Arg = alloca %class.MatrixClassTemplate, align 8
+  // SPIRV-NEXT:    %tmp = alloca %class.MatrixClassTemplate, align 8
+  // SPIRV-NEXT:    %tmp2 = alloca %class.MatrixClassTemplate, align 8
+  // SPIRV-NEXT:    store <12 x float> %Data, ptr %Data.addr, align 4
+  // SPIRV-NEXT:    %1 = load <12 x float>, ptr %Data.addr, align 4
+  // SPIRV-NEXT:    %Data1 = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr %Arg, i32 0, i32 1
+  // SPIRV-NEXT:    store <12 x float> %1, ptr %Data1, align 4
+  // SPIRV-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %Arg, i64 64, i1 false)
+  // SPIRV-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp2, ptr align 8 %agg.result, i64 64, i1 false)
+  // SPIRV-NEXT:    call{{.*}} void @_Z25matrix_template_referenceIfLj3ELj4EEv19MatrixClassTemplateIT_XT0_EXT1_EES2_(ptr noalias nonnull align 8 dereferenceable(64) %tmp, ptr noalias nonnull align 8 dereferenceable(64) %tmp2)
+  // SPIRV-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %Arg, ptr align 8 %tmp, i64 64, i1 false)
+  // SPIRV-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.result, ptr align 8 %tmp2, i64 64, i1 false)
+  // SPIRV-NEXT:    ret void
+
+  // SPIRV-LABEL: define{{.*}} void @_Z25matrix_template_referenceIfLj3ELj4EEv19MatrixClassTemplateIT_XT0_EXT1_EES2_(ptr noalias nonnull align 8 dereferenceable(64) %a, ptr noalias nonnull align 8 dereferenceable(64) %b)
+  // SPIRV-NEXT:  entry:
+  // SPIRV-NEXT:    %0 = call token @llvm.experimental.convergence.entry()
+  // SPIRV-NEXT:    %a.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    %b.addr = alloca ptr, align 8
+  // SPIRV-NEXT:    store ptr %a, ptr %a.addr, align 8
+  // SPIRV-NEXT:    store ptr %b, ptr %b.addr, align 8
+  // SPIRV-NEXT:    %1 = load ptr, ptr %a.addr, align 8
+  // SPIRV-NEXT:    %Data = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr %1, i32 0, i32 1
+  // SPIRV-NEXT:    %2 = load <12 x float>, ptr %Data, align 4
+  // SPIRV-NEXT:    %3 = load ptr, ptr %b.addr, align 8
+  // SPIRV-NEXT:    %Data1 = getelementptr inbounds nuw %class.MatrixClassTemplate, ptr %3, i32 0, i32 1
+  // SPIRV-NEXT:    store <12 x float> %2, ptr %Data1, align 4
+  // SPIRV-NEXT:    ret void
+
+  MatrixClassTemplate<float, 3, 4> Result, Arg;
+  Arg.Data = Data;
+  matrix_template_reference(Arg, Result);
+  return Result;
+}
+
+#endif
+
diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
index 15c963dfa666f4..05260f7b32ff87 100644
--- a/clang/test/CodeGenHLSL/basic_types.hlsl
+++ b/clang/test/CodeGenHLSL/basic_types.hlsl
@@ -11,30 +11,39 @@
 // CHECK:"?uint_Val@@3IA" = global i32 0, align 4
 // CHECK:"?uint64_t_Val@@3KA" = global i64 0, align 8
 // CHECK:"?int64_t_Val@@3JA" = global i64 0, align 8
+// CHECK:"?int16_t1_Val@@3T?$__vector@F$00@__clang@@A" = global <1 x i16> zeroinitializer, align 2
 // CHECK:"?int16_t2_Val@@3T?$__vector@F$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4
 // CHECK:"?int16_t3_Val@@3T?$__vector@F$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8
 // CHECK:"?int16_t4_Val@@3T?$__vector@F$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8
+// CHECK:"?uint16_t1_Val@@3T?$__vector@G$00@__clang@@A" = global <1 x i16> zeroinitializer, align 2
 // CHECK:"?uint16_t2_Val@@3T?$__vector@G$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4
 // CHECK:"?uint16_t3_Val@@3T?$__vector@G$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8
 // CHECK:"?uint16_t4_Val@@3T?$__vector@G$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8
+// CHECK:"?int1_Val@@3T?$__vector@H$00@__clang@@A" = global <1 x i32> zeroinitializer, align 4
 // CHECK:"?int2_Val@@3T?$__vector@H$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8
 // CHECK:"?int3_Val@@3T?$__vector@H$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16
 // CHECK:"?int4_Val@@3T?$__vector@H$03@__clang@@A" = global <4 x i32> zeroinitializer, align 16
+// CHECK:"?uint1_Val@@3T?$__vector@I$00@__clang@@A" = global <1 x i32> zeroinitializer, align 4
 // CHECK:"?uint2_Val@@3T?$__vector@I$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8
 // CHECK:"?uint3_Val@@3T?$__vector@I$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16
 // CHECK:"?uint4_Val@@3T?$__vector@I$03@__clang@@A" = global <4 x i32> zeroinitializer, align 16
+// CHECK:"?int64_t1_Val@@3T?$__vector@J$00@__clang@@A" = global <1 x i64> zeroinitializer, align 8
 // CHECK:"?int64_t2_Val@@3T?$__vector@J$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16
 // CHECK:"?int64_t3_Val@@3T?$__vector@J$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32
 // CHECK:"?int64_t4_Val@@3T?$__vector@J$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32
+// CHECK:"?uint64_t1_Val@@3T?$__vector@K$00@__clang@@A" = global <1 x i64> zeroinitializer, align 8
 // CHECK:"?uint64_t2_Val@@3T?$__vector@K$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16
 // CHECK:"?uint64_t3_Val@@3T?$__vector@K$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32
 // CHECK:"?uint64_t4_Val@@3T?$__vector@K$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32
+// CHECK:"?half1_Val@@3T?$__vector@$f16@$00@__clang@@A" = global <1 x half> zeroinitializer, align 2
 // CHECK:"?half2_Val@@3T?$__vector@$f16@$01@__clang@@A" = global <2 x half> zeroinitializer, align 4
 // CHECK:"?half3_Val@@3T?$__vector@$f16@$02@__clang@@A" = global <3 x half> zeroinitializer, align 8
 // CHECK:"?half4_Val@@3T?$__vector@$f16@$03@__clang@@A" = global <4 x half> zeroinitializer, align 8
+// CHECK:"?float1_Val@@3T?$__vector@M$00@__clang@@A" = global <1 x float> zeroinitializer, align 4
 // CHECK:"?float2_Val@@3T?$__vector@M$01@__clang@@A" = global <2 x float> zeroinitializer, align 8
 // CHECK:"?float3_Val@@3T?$__vector@M$02@__clang@@A" = global <3 x float> zeroinitializer, align 16
 // CHECK:"?float4_Val@@3T?$__vector@M$03@__clang@@A" = global <4 x float> zeroinitializer, align 16
+// CHECK:"?double1_Val@@3T?$__vector@N$00@__clang@@A" = global <1 x double> zeroinitializer, align 8
 // CHECK:"?double2_Val@@3T?$__vector@N$01@__clang@@A" = global <2 x double> zeroinitializer, align 16
 // CHECK:"?double3_Val@@3T?$__vector@N$02@__clang@@A" = global <3 x double> zeroinitializer, align 32
 // CHECK:"?double4_Val@@3T?$__vector@N$03@__clang@@A" = global <4 x double> zeroinitializer, align 32
@@ -60,36 +69,45 @@ TYPE_DECL(int64_t);
 // built-in vector data types:
 
 #ifdef __HLSL_ENABLE_16_BIT
+TYPE_DECL(int16_t1   );
 TYPE_DECL(int16_t2   );
 TYPE_DECL(int16_t3   );
 TYPE_DECL(int16_t4   );
+TYPE_DECL( uint16_t1 );
 TYPE_DECL( uint16_t2 );
 TYPE_DECL( uint16_t3 );
 TYPE_DECL( uint16_t4 );
 #endif
 
+TYPE_DECL( int1  );
 TYPE_DECL( int2  );
 TYPE_DECL( int3  );
 TYPE_DECL( int4  );
+TYPE_DECL( uint1 );
 TYPE_DECL( uint2 );
 TYPE_DECL( uint3 );
 TYPE_DECL( uint4     );
+TYPE_DECL( int64_t1  );
 TYPE_DECL( int64_t2  );
 TYPE_DECL( int64_t3  );
 TYPE_DECL( int64_t4  );
+TYPE_DECL( uint64_t1 );
 TYPE_DECL( uint64_t2 );
 TYPE_DECL( uint64_t3 );
 TYPE_DECL( uint64_t4 );
 
 #ifdef __HLSL_ENABLE_16_BIT
+TYPE_DECL(half1 );
 TYPE_DECL(half2 );
 TYPE_DECL(half3 );
 TYPE_DECL(half4 );
 #endif
 
+TYPE_DECL( float1  );
 TYPE_DECL( float2  );
 TYPE_DECL( float3  );
 TYPE_DECL( float4  );
+TYPE_DECL( double1 );
 TYPE_DECL( double2 );
 TYPE_DECL( double3 );
 TYPE_DECL( double4 );
diff --git a/clang/test/CodeGenHLSL/matrix-types.hlsl b/clang/test/CodeGenHLSL/matrix-types.hlsl
new file mode 100644
index 00000000000000..721d383cd04f1f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/matrix-types.hlsl
@@ -0,0 +1,348 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DSPIRV| FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - -DSPIRV -DNAMESPACED| FileCheck %s
+
+#ifdef NAMESPACED // defined by the RUN lines that pass -DNAMESPACED
+#define TYPE_DECL(T)  hlsl::T T##_Val // declares T_Val with the hlsl::-qualified type name
+#else
+#define TYPE_DECL(T)  T T##_Val // declares T_Val with the unqualified type name
+#endif
+
+// Until MicrosoftCXXABI supports mangling matrices,
+// these have to be local variables for DXIL.
+#ifndef SPIRV
+void f() {
+#endif
+
+// built-in matrix types:
+
+// Capture target-specific details: PFX is the symbol sigil (% or @), STR the storage kind (alloca or global), ZI the optional zeroinitializer.
+//CHECK: [[PFX:[%@]]]int16_t1x1_Val = [[STR:(alloca|global)]] [1 x i16][[ZI:( zeroinitializer)?]], align 2
+//CHECK: [[PFX]]int16_t1x2_Val = [[STR]] [2 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t1x3_Val = [[STR]] [3 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t1x4_Val = [[STR]] [4 x i16][[ZI]], align 2
+TYPE_DECL( int16_t1x1 );
+TYPE_DECL( int16_t1x2 );
+TYPE_DECL( int16_t1x3 );
+TYPE_DECL( int16_t1x4 );
+
+//CHECK: [[PFX]]int16_t2x1_Val = [[STR]] [2 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t2x2_Val = [[STR]] [4 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t2x3_Val = [[STR]] [6 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t2x4_Val = [[STR]] [8 x i16][[ZI]], align 2
+TYPE_DECL( int16_t2x1 );
+TYPE_DECL( int16_t2x2 );
+TYPE_DECL( int16_t2x3 );
+TYPE_DECL( int16_t2x4 );
+
+//CHECK: [[PFX]]int16_t3x1_Val = [[STR]] [3 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t3x2_Val = [[STR]] [6 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t3x3_Val = [[STR]] [9 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t3x4_Val = [[STR]] [12 x i16][[ZI]], align 2
+TYPE_DECL( int16_t3x1 );
+TYPE_DECL( int16_t3x2 );
+TYPE_DECL( int16_t3x3 );
+TYPE_DECL( int16_t3x4 );
+
+//CHECK: [[PFX]]int16_t4x1_Val = [[STR]] [4 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t4x2_Val = [[STR]] [8 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t4x3_Val = [[STR]] [12 x i16][[ZI]], align 2
+//CHECK: [[PFX]]int16_t4x4_Val = [[STR]] [16 x i16][[ZI]], align 2
+TYPE_DECL( int16_t4x1 );
+TYPE_DECL( int16_t4x2 );
+TYPE_DECL( int16_t4x3 );
+TYPE_DECL( int16_t4x4 );
+
+//CHECK: [[PFX]]uint16_t1x1_Val = [[STR]] [1 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t1x2_Val = [[STR]] [2 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t1x3_Val = [[STR]] [3 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t1x4_Val = [[STR]] [4 x i16][[ZI]], align 2
+TYPE_DECL( uint16_t1x1 );
+TYPE_DECL( uint16_t1x2 );
+TYPE_DECL( uint16_t1x3 );
+TYPE_DECL( uint16_t1x4 );
+
+//CHECK: [[PFX]]uint16_t2x1_Val = [[STR]] [2 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t2x2_Val = [[STR]] [4 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t2x3_Val = [[STR]] [6 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t2x4_Val = [[STR]] [8 x i16][[ZI]], align 2
+TYPE_DECL( uint16_t2x1 );
+TYPE_DECL( uint16_t2x2 );
+TYPE_DECL( uint16_t2x3 );
+TYPE_DECL( uint16_t2x4 );
+
+//CHECK: [[PFX]]uint16_t3x1_Val = [[STR]] [3 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t3x2_Val = [[STR]] [6 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t3x3_Val = [[STR]] [9 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t3x4_Val = [[STR]] [12 x i16][[ZI]], align 2
+TYPE_DECL( uint16_t3x1 );
+TYPE_DECL( uint16_t3x2 );
+TYPE_DECL( uint16_t3x3 );
+TYPE_DECL( uint16_t3x4 );
+
+//CHECK: [[PFX]]uint16_t4x1_Val = [[STR]] [4 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t4x2_Val = [[STR]] [8 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t4x3_Val = [[STR]] [12 x i16][[ZI]], align 2
+//CHECK: [[PFX]]uint16_t4x4_Val = [[STR]] [16 x i16][[ZI]], align 2
+TYPE_DECL( uint16_t4x1 );
+TYPE_DECL( uint16_t4x2 );
+TYPE_DECL( uint16_t4x3 );
+TYPE_DECL( uint16_t4x4 );
+
+//CHECK: [[PFX]]int1x1_Val = [[STR]] [1 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int1x2_Val = [[STR]] [2 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int1x3_Val = [[STR]] [3 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int1x4_Val = [[STR]] [4 x i32][[ZI]], align 4
+TYPE_DECL( int1x1 );
+TYPE_DECL( int1x2 );
+TYPE_DECL( int1x3 );
+TYPE_DECL( int1x4 );
+
+//CHECK: [[PFX]]int2x1_Val = [[STR]] [2 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int2x2_Val = [[STR]] [4 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int2x3_Val = [[STR]] [6 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int2x4_Val = [[STR]] [8 x i32][[ZI]], align 4
+TYPE_DECL( int2x1 );
+TYPE_DECL( int2x2 );
+TYPE_DECL( int2x3 );
+TYPE_DECL( int2x4 );
+
+//CHECK: [[PFX]]int3x1_Val = [[STR]] [3 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int3x2_Val = [[STR]] [6 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int3x3_Val = [[STR]] [9 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int3x4_Val = [[STR]] [12 x i32][[ZI]], align 4
+TYPE_DECL( int3x1 );
+TYPE_DECL( int3x2 );
+TYPE_DECL( int3x3 );
+TYPE_DECL( int3x4 );
+
+//CHECK: [[PFX]]int4x1_Val = [[STR]] [4 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int4x2_Val = [[STR]] [8 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int4x3_Val = [[STR]] [12 x i32][[ZI]], align 4
+//CHECK: [[PFX]]int4x4_Val = [[STR]] [16 x i32][[ZI]], align 4
+TYPE_DECL( int4x1 );
+TYPE_DECL( int4x2 );
+TYPE_DECL( int4x3 );
+TYPE_DECL( int4x4 );
+
+//CHECK: [[PFX]]uint1x1_Val = [[STR]] [1 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint1x2_Val = [[STR]] [2 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint1x3_Val = [[STR]] [3 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint1x4_Val = [[STR]] [4 x i32][[ZI]], align 4
+TYPE_DECL( uint1x1 );
+TYPE_DECL( uint1x2 );
+TYPE_DECL( uint1x3 );
+TYPE_DECL( uint1x4 );
+
+//CHECK: [[PFX]]uint2x1_Val = [[STR]] [2 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint2x2_Val = [[STR]] [4 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint2x3_Val = [[STR]] [6 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint2x4_Val = [[STR]] [8 x i32][[ZI]], align 4
+TYPE_DECL( uint2x1 );
+TYPE_DECL( uint2x2 );
+TYPE_DECL( uint2x3 );
+TYPE_DECL( uint2x4 );
+
+//CHECK: [[PFX]]uint3x1_Val = [[STR]] [3 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint3x2_Val = [[STR]] [6 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint3x3_Val = [[STR]] [9 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint3x4_Val = [[STR]] [12 x i32][[ZI]], align 4
+TYPE_DECL( uint3x1 );
+TYPE_DECL( uint3x2 );
+TYPE_DECL( uint3x3 );
+TYPE_DECL( uint3x4 );
+
+//CHECK: [[PFX]]uint4x1_Val = [[STR]] [4 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint4x2_Val = [[STR]] [8 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint4x3_Val = [[STR]] [12 x i32][[ZI]], align 4
+//CHECK: [[PFX]]uint4x4_Val = [[STR]] [16 x i32][[ZI]], align 4
+TYPE_DECL( uint4x1 );
+TYPE_DECL( uint4x2 );
+TYPE_DECL( uint4x3 );
+TYPE_DECL( uint4x4 );
+
+//CHECK: [[PFX]]int64_t1x1_Val = [[STR]] [1 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t1x2_Val = [[STR]] [2 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t1x3_Val = [[STR]] [3 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t1x4_Val = [[STR]] [4 x i64][[ZI]], align 8
+TYPE_DECL( int64_t1x1 );
+TYPE_DECL( int64_t1x2 );
+TYPE_DECL( int64_t1x3 );
+TYPE_DECL( int64_t1x4 );
+
+//CHECK: [[PFX]]int64_t2x1_Val = [[STR]] [2 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t2x2_Val = [[STR]] [4 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t2x3_Val = [[STR]] [6 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t2x4_Val = [[STR]] [8 x i64][[ZI]], align 8
+TYPE_DECL( int64_t2x1 );
+TYPE_DECL( int64_t2x2 );
+TYPE_DECL( int64_t2x3 );
+TYPE_DECL( int64_t2x4 );
+
+//CHECK: [[PFX]]int64_t3x1_Val = [[STR]] [3 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t3x2_Val = [[STR]] [6 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t3x3_Val = [[STR]] [9 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t3x4_Val = [[STR]] [12 x i64][[ZI]], align 8
+TYPE_DECL( int64_t3x1 );
+TYPE_DECL( int64_t3x2 );
+TYPE_DECL( int64_t3x3 );
+TYPE_DECL( int64_t3x4 );
+
+//CHECK: [[PFX]]int64_t4x1_Val = [[STR]] [4 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t4x2_Val = [[STR]] [8 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t4x3_Val = [[STR]] [12 x i64][[ZI]], align 8
+//CHECK: [[PFX]]int64_t4x4_Val = [[STR]] [16 x i64][[ZI]], align 8
+TYPE_DECL( int64_t4x1 );
+TYPE_DECL( int64_t4x2 );
+TYPE_DECL( int64_t4x3 );
+TYPE_DECL( int64_t4x4 );
+
+//CHECK: [[PFX]]uint64_t1x1_Val = [[STR]] [1 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t1x2_Val = [[STR]] [2 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t1x3_Val = [[STR]] [3 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t1x4_Val = [[STR]] [4 x i64][[ZI]], align 8
+TYPE_DECL( uint64_t1x1 );
+TYPE_DECL( uint64_t1x2 );
+TYPE_DECL( uint64_t1x3 );
+TYPE_DECL( uint64_t1x4 );
+
+//CHECK: [[PFX]]uint64_t2x1_Val = [[STR]] [2 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t2x2_Val = [[STR]] [4 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t2x3_Val = [[STR]] [6 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t2x4_Val = [[STR]] [8 x i64][[ZI]], align 8
+TYPE_DECL( uint64_t2x1 );
+TYPE_DECL( uint64_t2x2 );
+TYPE_DECL( uint64_t2x3 );
+TYPE_DECL( uint64_t2x4 );
+
+//CHECK: [[PFX]]uint64_t3x1_Val = [[STR]] [3 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t3x2_Val = [[STR]] [6 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t3x3_Val = [[STR]] [9 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t3x4_Val = [[STR]] [12 x i64][[ZI]], align 8
+TYPE_DECL( uint64_t3x1 );
+TYPE_DECL( uint64_t3x2 );
+TYPE_DECL( uint64_t3x3 );
+TYPE_DECL( uint64_t3x4 );
+
+//CHECK: [[PFX]]uint64_t4x1_Val = [[STR]] [4 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t4x2_Val = [[STR]] [8 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t4x3_Val = [[STR]] [12 x i64][[ZI]], align 8
+//CHECK: [[PFX]]uint64_t4x4_Val = [[STR]] [16 x i64][[ZI]], align 8
+TYPE_DECL( uint64_t4x1 );
+TYPE_DECL( uint64_t4x2 );
+TYPE_DECL( uint64_t4x3 );
+TYPE_DECL( uint64_t4x4 );
+
+
+//CHECK: [[PFX]]half1x1_Val = [[STR]] [1 x half][[ZI]], align 2
+//CHECK: [[PFX]]half1x2_Val = [[STR]] [2 x half][[ZI]], align 2
+//CHECK: [[PFX]]half1x3_Val = [[STR]] [3 x half][[ZI]], align 2
+//CHECK: [[PFX]]half1x4_Val = [[STR]] [4 x half][[ZI]], align 2
+TYPE_DECL( half1x1 );
+TYPE_DECL( half1x2 );
+TYPE_DECL( half1x3 );
+TYPE_DECL( half1x4 );
+
+//CHECK: [[PFX]]half2x1_Val = [[STR]] [2 x half][[ZI]], align 2
+//CHECK: [[PFX]]half2x2_Val = [[STR]] [4 x half][[ZI]], align 2
+//CHECK: [[PFX]]half2x3_Val = [[STR]] [6 x half][[ZI]], align 2
+//CHECK: [[PFX]]half2x4_Val = [[STR]] [8 x half][[ZI]], align 2
+TYPE_DECL( half2x1 );
+TYPE_DECL( half2x2 );
+TYPE_DECL( half2x3 );
+TYPE_DECL( half2x4 );
+
+//CHECK: [[PFX]]half3x1_Val = [[STR]] [3 x half][[ZI]], align 2
+//CHECK: [[PFX]]half3x2_Val = [[STR]] [6 x half][[ZI]], align 2
+//CHECK: [[PFX]]half3x3_Val = [[STR]] [9 x half][[ZI]], align 2
+//CHECK: [[PFX]]half3x4_Val = [[STR]] [12 x half][[ZI]], align 2
+TYPE_DECL( half3x1 );
+TYPE_DECL( half3x2 );
+TYPE_DECL( half3x3 );
+TYPE_DECL( half3x4 );
+
+//CHECK: [[PFX]]half4x1_Val = [[STR]] [4 x half][[ZI]], align 2
+//CHECK: [[PFX]]half4x2_Val = [[STR]] [8 x half][[ZI]], align 2
+//CHECK: [[PFX]]half4x3_Val = [[STR]] [12 x half][[ZI]], align 2
+//CHECK: [[PFX]]half4x4_Val = [[STR]] [16 x half][[ZI]], align 2
+TYPE_DECL( half4x1 );
+TYPE_DECL( half4x2 );
+TYPE_DECL( half4x3 );
+TYPE_DECL( half4x4 );
+
+//CHECK: [[PFX]]float1x1_Val = [[STR]] [1 x float][[ZI]], align 4
+//CHECK: [[PFX]]float1x2_Val = [[STR]] [2 x float][[ZI]], align 4
+//CHECK: [[PFX]]float1x3_Val = [[STR]] [3 x float][[ZI]], align 4
+//CHECK: [[PFX]]float1x4_Val = [[STR]] [4 x float][[ZI]], align 4
+TYPE_DECL( float1x1 );
+TYPE_DECL( float1x2 );
+TYPE_DECL( float1x3 );
+TYPE_DECL( float1x4 );
+
+//CHECK: [[PFX]]float2x1_Val = [[STR]] [2 x float][[ZI]], align 4
+//CHECK: [[PFX]]float2x2_Val = [[STR]] [4 x float][[ZI]], align 4
+//CHECK: [[PFX]]float2x3_Val = [[STR]] [6 x float][[ZI]], align 4
+//CHECK: [[PFX]]float2x4_Val = [[STR]] [8 x float][[ZI]], align 4
+TYPE_DECL( float2x1 );
+TYPE_DECL( float2x2 );
+TYPE_DECL( float2x3 );
+TYPE_DECL( float2x4 );
+
+//CHECK: [[PFX]]float3x1_Val = [[STR]] [3 x float][[ZI]], align 4
+//CHECK: [[PFX]]float3x2_Val = [[STR]] [6 x float][[ZI]], align 4
+//CHECK: [[PFX]]float3x3_Val = [[STR]] [9 x float][[ZI]], align 4
+//CHECK: [[PFX]]float3x4_Val = [[STR]] [12 x float][[ZI]], align 4
+TYPE_DECL( float3x1 );
+TYPE_DECL( float3x2 );
+TYPE_DECL( float3x3 );
+TYPE_DECL( float3x4 );
+
+//CHECK: [[PFX]]float4x1_Val = [[STR]] [4 x float][[ZI]], align 4
+//CHECK: [[PFX]]float4x2_Val = [[STR]] [8 x float][[ZI]], align 4
+//CHECK: [[PFX]]float4x3_Val = [[STR]] [12 x float][[ZI]], align 4
+//CHECK: [[PFX]]float4x4_Val = [[STR]] [16 x float][[ZI]], align 4
+TYPE_DECL( float4x1 );
+TYPE_DECL( float4x2 );
+TYPE_DECL( float4x3 );
+TYPE_DECL( float4x4 );
+
+//CHECK: [[PFX]]double1x1_Val = [[STR]] [1 x double][[ZI]], align 8
+//CHECK: [[PFX]]double1x2_Val = [[STR]] [2 x double][[ZI]], align 8
+//CHECK: [[PFX]]double1x3_Val = [[STR]] [3 x double][[ZI]], align 8
+//CHECK: [[PFX]]double1x4_Val = [[STR]] [4 x double][[ZI]], align 8
+TYPE_DECL( double1x1 );
+TYPE_DECL( double1x2 );
+TYPE_DECL( double1x3 );
+TYPE_DECL( double1x4 );
+
+//CHECK: [[PFX]]double2x1_Val = [[STR]] [2 x double][[ZI]], align 8
+//CHECK: [[PFX]]double2x2_Val = [[STR]] [4 x double][[ZI]], align 8
+//CHECK: [[PFX]]double2x3_Val = [[STR]] [6 x double][[ZI]], align 8
+//CHECK: [[PFX]]double2x4_Val = [[STR]] [8 x double][[ZI]], align 8
+TYPE_DECL( double2x1 );
+TYPE_DECL( double2x2 );
+TYPE_DECL( double2x3 );
+TYPE_DECL( double2x4 );
+
+//CHECK: [[PFX]]double3x1_Val = [[STR]] [3 x double][[ZI]], align 8
+//CHECK: [[PFX]]double3x2_Val = [[STR]] [6 x double][[ZI]], align 8
+//CHECK: [[PFX]]double3x3_Val = [[STR]] [9 x double][[ZI]], align 8
+//CHECK: [[PFX]]double3x4_Val = [[STR]] [12 x double][[ZI]], align 8
+TYPE_DECL( double3x1 );
+TYPE_DECL( double3x2 );
+TYPE_DECL( double3x3 );
+TYPE_DECL( double3x4 );
+
+//CHECK: [[PFX]]double4x1_Val = [[STR]] [4 x double][[ZI]], align 8
+//CHECK: [[PFX]]double4x2_Val = [[STR]] [8 x double][[ZI]], align 8
+//CHECK: [[PFX]]double4x3_Val = [[STR]] [12 x double][[ZI]], align 8
+//CHECK: [[PFX]]double4x4_Val = [[STR]] [16 x double][[ZI]], align 8
+TYPE_DECL( double4x1 );
+TYPE_DECL( double4x2 );
+TYPE_DECL( double4x3 );
+TYPE_DECL( double4x4 );
+
+#ifndef SPIRV // closes the DXIL-only wrapper function f() opened above
+}
+#endif
diff --git a/clang/test/Sema/matrix-type-operators.c b/clang/test/Sema/matrix-type-operators.c
index c83685fc7c6402..96cf176c2d7112 100644
--- a/clang/test/Sema/matrix-type-operators.c
+++ b/clang/test/Sema/matrix-type-operators.c
@@ -17,12 +17,12 @@ void add(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
   // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
 
   a = b + &c;
-  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
-  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))') to incompatible type 'float'}}
 
   b += &c;
-  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
-  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))') to incompatible type 'float'}}
 }
 
 void sub(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
@@ -38,12 +38,12 @@ void sub(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
   // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
 
   a = b - &c;
-  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
-  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))') to incompatible type 'float'}}
 
   b -= &c;
-  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
-  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float * __attribute__((matrix_type(10, 5)))') to incompatible type 'float'}}
 }
 
 typedef int ix10x5_t __attribute__((matrix_type(10, 5)));
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
new file mode 100644
index 00000000000000..5acddc2e533021
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-cast.hlsl
@@ -0,0 +1,138 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-library -fnative-half-type -finclude-default-header -fsyntax-only %s -verify
+
+typedef struct test_struct { // expected-note 1+ {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}}
+  // expected-note-re@-1 1+ {{candidate constructor (the implicit move constructor) not viable: no known conversion from '{{[^']*}}' (aka '{{[^']*}}') to 'test_struct' for 1st argument}}
+  // expected-note-re@-2 1+ {{candidate constructor (the implicit copy constructor) not viable: no known conversion from '{{[^']*}}' (aka '{{[^']*}}') to 'const test_struct' for 1st argument}}
+} test_struct; // empty struct used below as a non-matrix cast source and destination
+
+void f1(void) { // C-style casts on the short-form matrix aliases: same-size element conversions are accepted, everything else is diagnosed
+  uint16_t3x3 u16_3x3;
+  int3x3 i32_3x3;
+  int16_t3x3 i16_3x3;
+  int4x4 i32_4x4;
+  float4x4 f32_4x4;
+  int i;
+  float4 v;
+  test_struct s;
+
+  i32_3x3 = (int3x3)u16_3x3;
+  i16_3x3 = (int16_t3x3)i32_3x3;
+  i32_4x4 = (int4x4)i16_3x3;        // expected-error {{conversion between matrix types 'int4x4' (aka 'matrix<int, 4, 4>') and 'matrix<short, 3, 3>' of different size is not allowed}}
+  f32_4x4 = (int4x4)i32_4x4;        // expected-error {{assigning to 'matrix<float, [2 * ...]>' from incompatible type 'matrix<int, [2 * ...]>'}}
+  i = (int)i32_4x4;           // expected-error {{C-style cast from 'int4x4' (aka 'matrix<int, 4, 4>') to 'int' is not allowed}}
+  i32_4x4 = (int4x4)i;         // expected-error {{C-style cast from 'int' to 'int4x4' (aka 'matrix<int, 4, 4>') is not allowed}}
+  v = (float4)i32_4x4;           // expected-error {{C-style cast from 'int4x4' (aka 'matrix<int, 4, 4>') to 'float4' (aka 'vector<float, 4>') is not allowed}}
+  i32_4x4 = (int4x4)v;         // expected-error {{C-style cast from 'float4' (aka 'vector<float, 4>') to 'int4x4' (aka 'matrix<int, 4, 4>') is not allowed}}
+  s = (test_struct)i16_3x3; // expected-error {{no matching conversion for C-style cast from 'int16_t3x3' (aka 'matrix<int16_t, 3, 3>') to 'test_struct'}}
+  i16_3x3 = (int16_t3x3)s;         // expected-error {{cannot convert 'test_struct' to 'int16_t3x3' (aka 'matrix<int16_t, 3, 3>') without a conversion operator}}
+
+  i32_4x4 = (int4x4)f32_4x4;
+}
+
+void f2(void) { // C-style casts among float/double and int/uint matrix aliases, plus matrix<->scalar float casts
+  float2x2 f32_2x2;
+  double3x3 f64_3x3;
+  double2x2 f64_2x2;
+  int4x4 i32_4x4;
+  uint4x4 u32_4x4;
+  uint3x3 u32_3x3;
+  float f;
+
+  f64_3x3 = (double3x3)f32_2x2; // expected-error {{conversion between matrix types 'double3x3' (aka 'matrix<double, 3, 3>') and 'matrix<float, 2, 2>' of different size is not allowed}}
+  f64_2x2 = (double2x2)f32_2x2;
+
+  u32_4x4 = (uint4x4)i32_4x4;
+  i32_4x4 = (int4x4)u32_4x4;
+  u32_3x3 = (uint3x3)i32_4x4; // expected-error {{conversion between matrix types 'uint3x3' (aka 'matrix<uint, 3, 3>') and 'matrix<int, 4, 4>' of different size is not allowed}}
+  f = (float)i32_4x4;    // expected-error {{C-style cast from 'int4x4' (aka 'matrix<int, 4, 4>') to 'float' is not allowed}}
+  i32_4x4 = (int4x4)f;    // expected-error {{C-style cast from 'float' to 'int4x4' (aka 'matrix<int, 4, 4>') is not allowed}}
+}
+
+template <typename X> // X is the element type
+using matrix_3_3 = matrix<X, 3, 3>; // alias template fixing both matrix dimensions to 3
+
+template <typename Y> // Y is the element type
+using matrix_4_4 = matrix<Y, 4, 4>; // alias template fixing both matrix dimensions to 4
+
+void f3() { // C-style casts spelled through the matrix_3_3 / matrix_4_4 alias templates
+  matrix_3_3<uint16_t> u16_3x3;
+  matrix_3_3<int> i32_3x3;
+  matrix_3_3<int16_t> i16_3x3;
+  matrix_4_4<int> i32_4x4;
+  matrix_4_4<float> f32_4x4;
+  int i;
+  int4 v;
+  test_struct s;
+
+  i32_3x3 = (matrix_3_3<int>)u16_3x3;
+  i32_3x3 = u16_3x3; // expected-error {{assigning to 'matrix_3_3<int>' from incompatible type 'matrix_3_3<uint16_t>'}}
+  i16_3x3 = (matrix_3_3<int16_t>)i32_3x3;
+  i32_4x4 = (matrix_4_4<int>)i16_3x3; // expected-error {{conversion between matrix types 'matrix_4_4<int>' (aka 'matrix<int, 4, 4>') and 'matrix<short, 3, 3>' of different size is not allowed}}
+
+  i = (int)i16_3x3;            // expected-error {{C-style cast from 'matrix_3_3<int16_t>' (aka 'matrix<int16_t, 3, 3>') to 'int' is not allowed}}
+  i32_3x3 = (matrix_3_3<int>)i; // expected-error {{C-style cast from 'int' to 'matrix_3_3<int>' (aka 'matrix<int, 3, 3>') is not allowed}}
+
+  v = (int4)i32_3x3;            // expected-error {{C-style cast from 'matrix_3_3<int>' (aka 'matrix<int, 3, 3>') to 'int4' (aka 'vector<int, 4>') is not allowed}}
+  u16_3x3 = (matrix_3_3<uint16_t>)v; // expected-error {{C-style cast from 'int4' (aka 'vector<int, 4>') to 'matrix_3_3<uint16_t>' (aka 'matrix<uint16_t, 3, 3>') is not allowed}}
+  s = (test_struct)u16_3x3;    // expected-error {{no matching conversion for C-style cast from 'matrix_3_3<uint16_t>' (aka 'matrix<uint16_t, 3, 3>') to 'test_struct'}}
+  f32_4x4 = (matrix_4_4<float>)s; // expected-error {{cannot convert 'test_struct' to 'matrix_4_4<float>' (aka 'matrix<float, 4, 4>') without a conversion operator}}
+}
+
+void f4() { // static_cast counterparts of the casts in f3. NOTE(review): the vector-to-matrix cast below is assigned to i16_3x3 while f3 assigns its counterpart to u16_3x3 -- confirm this is intended
+  matrix_3_3<uint16_t> u16_3x3;
+  matrix_3_3<int> i32_3x3;
+  matrix_3_3<int16_t> i16_3x3;
+  matrix_4_4<int> i32_4x4;
+  matrix_4_4<float> f32_4x4;
+  int i;
+  int4 v;
+  test_struct s;
+
+  i32_3x3 = static_cast<matrix_3_3<int>>(u16_3x3);
+  i16_3x3 = static_cast<matrix_3_3<int16_t>>(i32_3x3);
+  i32_4x4 = static_cast<matrix_4_4<int>>(i16_3x3); // expected-error {{conversion between matrix types 'matrix_4_4<int>' (aka 'matrix<int, 4, 4>') and 'matrix<short, 3, 3>' of different size is not allowed}}
+
+  i = static_cast<int>(i16_3x3);            // expected-error {{static_cast from 'matrix_3_3<int16_t>' (aka 'matrix<int16_t, 3, 3>') to 'int' is not allowed}}
+  i32_3x3 = static_cast<matrix_3_3<int>>(i); // expected-error {{static_cast from 'int' to 'matrix_3_3<int>' (aka 'matrix<int, 3, 3>') is not allowed}}
+
+  v = static_cast<int4>(i32_3x3);             // expected-error {{static_cast from 'matrix_3_3<int>' (aka 'matrix<int, 3, 3>') to 'int4' (aka 'vector<int, 4>') is not allowed}}
+  i16_3x3 = static_cast<matrix_3_3<uint16_t>>(v); // expected-error {{static_cast from 'int4' (aka 'vector<int, 4>') to 'matrix_3_3<uint16_t>' (aka 'matrix<uint16_t, 3, 3>') is not allowed}}
+
+  s = static_cast<test_struct>(u16_3x3);    // expected-error {{no matching conversion for static_cast from 'matrix_3_3<uint16_t>' (aka 'matrix<uint16_t, 3, 3>') to 'test_struct'}}
+  f32_4x4 = static_cast<matrix_4_4<float>>(s); // expected-error {{cannot convert 'test_struct' to 'matrix_4_4<float>' (aka 'matrix<float, 4, 4>') without a conversion operator}}
+}
+
+void f5() { // C-style casts between float, double, signed and unsigned element types via the alias templates
+  matrix_3_3<float> f32_3x3;
+  matrix_3_3<double> f64_3x3;
+  matrix_4_4<double> f64_4x4;
+  matrix_4_4<signed int> i32_4x4;
+  matrix_3_3<unsigned int> u32_3x3;
+  matrix_4_4<unsigned int> u32_4x4;
+  float f;
+
+  f64_3x3 = (matrix_3_3<double>)f32_3x3;
+  f64_4x4 = (matrix_4_4<double>)f32_3x3; // expected-error {{conversion between matrix types 'matrix_4_4<double>' (aka 'matrix<double, 4, 4>') and 'matrix<float, 3, 3>' of different size is not allowed}}
+  i32_4x4 = (matrix_4_4<signed int>)f64_4x4;
+  u32_3x3 = (matrix_4_4<unsigned int>)i32_4x4; // expected-error {{assigning to 'matrix<[...], 3, 3>' from incompatible type 'matrix<[...], 4, 4>'}}
+  u32_4x4 = (matrix_4_4<unsigned int>)i32_4x4;
+  i32_4x4 = (matrix_4_4<signed int>)u32_4x4;
+}
+
+void f6() { // static_cast counterparts of the casts in f5
+  matrix_3_3<float> f32_3x3;
+  matrix_3_3<double> f64_3x3;
+  matrix_4_4<double> f64_4x4;
+  matrix_4_4<signed int> i32_4x4;
+  matrix_3_3<unsigned int> u32_3x3;
+  matrix_4_4<unsigned int> u32_4x4;
+  float f;
+
+  f64_3x3 = static_cast<matrix_3_3<double>>(f32_3x3);
+  f64_4x4 = static_cast<matrix_4_4<double>>(f32_3x3); // expected-error {{conversion between matrix types 'matrix_4_4<double>' (aka 'matrix<double, 4, 4>') and 'matrix<float, 3, 3>' of different size is not allowed}}
+
+  i32_4x4 = static_cast<matrix_4_4<signed int>>(f64_4x4);
+  u32_3x3 = static_cast<matrix_4_4<unsigned int>>(i32_4x4); // expected-error {{assigning to 'matrix<[...], 3, 3>' from incompatible type 'matrix<[...], 4, 4>'}}
+  u32_4x4 = static_cast<matrix_4_4<unsigned int>>(i32_4x4);
+  i32_4x4 = static_cast<matrix_4_4<signed int>>(u32_4x4);
+}
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl
new file mode 100644
index 00000000000000..29640ae01d6fb2
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-index-operator-type.hlsl
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify
+
+double indexi32(matrix<double,3,1> X, int       i) { return X[i][0]; } // this group varies the integer type of the first subscript
+
+double indexu32(matrix<double,3,1> X, uint      i) { return X[i][0]; }
+
+double indexi16(matrix<double,3,1> X, int16_t   i) { return X[i][0]; }
+
+double indexu16(matrix<double,3,1> X, uint16_t  i) { return X[i][0]; }
+
+double indexi64(matrix<double,3,1> X, int64_t   i) { return X[i][0]; }
+
+double indexu64(matrix<double,3,1> X, uint64_t  i) { return X[i][0]; }
+
+double indexi32c(matrix<double,3,1> X, int      i) { return X[0][i]; } // this group varies the integer type of the second subscript
+
+double indexu32c(matrix<double,3,1> X, uint     i) { return X[0][i]; }
+
+double indexi16c(matrix<double,3,1> X, int16_t  i) { return X[0][i]; }
+
+double indexu16c(matrix<double,3,1> X, uint16_t i) { return X[0][i]; }
+
+double indexi64c(matrix<double,3,1> X, int64_t  i) { return X[0][i]; }
+
+double indexu64c(matrix<double,3,1> X, uint64_t i) { return X[0][i]; }
+
+// expected-no-diagnostics
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
new file mode 100644
index 00000000000000..3d982652b9e0ae
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-transpose.hlsl
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify
+
+void transpose() { // __builtin_matrix_transpose: result-type mismatches and non-matrix arguments are diagnosed
+  float3x4 a;
+  int3x2 b;
+  double3x3 c;
+  int e;
+  a = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'float3x4' (aka 'matrix<float, 3, 4>') from incompatible type 'matrix<int, 2, 3>'}}
+  b = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'int3x2' (aka 'matrix<int, 3, 2>') from incompatible type 'matrix<int, 2, 3>'}}
+  __builtin_matrix_transpose(e);
+  // expected-error@-1 {{1st argument must be a matrix}}
+  __builtin_matrix_transpose("test");
+  // expected-error@-1 {{1st argument must be a matrix}}
+
+  uint3x3 m = __builtin_matrix_transpose(c);
+  // expected-error@-1 {{cannot initialize a variable of type 'uint3x3' (aka 'matrix<uint, 3, 3>') with an rvalue of type 'matrix<double, 3, 3>'}}
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix { // wraps a single matrix<EltTy, Rows, Columns> member
+  using matrix_t = matrix<EltTy, Rows, Columns>;
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1>
+typename MyMatrix<EltTy1, R1, C1>::matrix_t transpose(inout MyMatrix<EltTy0, R0, C0> A) { // instantiated three times by test_transpose_template; the diagnostics below are counted per instantiation
+  uint16_t v1 = __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix<unsigned int, 3, 2>'}}
+  // expected-error@-2 2 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix<unsigned int, 3, 3>'}}
+
+  __builtin_matrix_transpose(A);
+  // expected-error@-1 3 {{1st argument must be a matrix}}
+
+  return __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'matrix<unsigned int, 2, 3>') with an rvalue of type 'matrix<unsigned int, 3, 2>'}}
+  // expected-error@-2 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'matrix<unsigned int, 2, 3>') with an rvalue of type 'matrix<unsigned int, 3, 3>'}}
+  // expected-error@-3 {{cannot initialize return object of type 'typename MyMatrix<float, 3U, 3U>::matrix_t' (aka 'matrix<float, 3, 3>') with an rvalue of type 'matrix<unsigned int, 3, 3>'}}
+}
+
+void test_transpose_template() { // drives the three instantiations of the transpose template that the notes below refer to
+  MyMatrix<unsigned, 2, 3> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  Mat1.value = transpose<unsigned, 2, 3, unsigned, 2, 3>(Mat1);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 2U, 3U, unsigned int, 2U, 3U>' requested here}}
+
+  Mat1.value = transpose<unsigned, 3, 3, unsigned, 2, 3>(Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 3U, 3U, unsigned int, 2U, 3U>' requested here}}
+
+  MyMatrix<float, 3, 3> Mat3;
+  Mat3.value = transpose<unsigned, 3, 3, float, 3, 3>(Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 3U, 3U, float, 3U, 3U>' requested here}}
+}
+
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
new file mode 100644
index 00000000000000..f80d431f3336d2
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type-operators.hlsl
@@ -0,0 +1,307 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify
+
+void add(float4x4 a, float3x4 b, float4x3 c) {
+  a = b + c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'float4x3' (aka 'matrix<float, 4, 3>'))}}
+
+  b += c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'float4x3' (aka 'matrix<float, 4, 3>'))}}
+
+  a = b + b; // expected-error {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}}
+
+  a = 10 + b;
+  // expected-error at -1 {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}}
+}
+
+void sub(float4x4 a, float3x4 b, float4x3 c) {
+  a = b - c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'float4x3' (aka 'matrix<float, 4, 3>'))}}
+
+  b -= c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'float4x3' (aka 'matrix<float, 4, 3>'))}}
+
+  a = b - b; // expected-error {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}}
+
+  a = 10 - b;
+  // expected-error at -1 {{assigning to 'matrix<[...], 4, [...]>' from incompatible type 'matrix<[...], 3, [...]>'}}
+
+}
+
+void matrix_matrix_multiply(float4x4 a, float3x4 b, int4x3 c, int4x4 d, float sf, inout uint16_t p) {
+  // Check dimension mismatches.
+  a = a * b;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float3x4' (aka 'matrix<float, 3, 4>'))}}
+  a *= b;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float3x4' (aka 'matrix<float, 3, 4>'))}}
+  b = a * a;
+  // expected-error at -1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}}
+
+  // Check element type mismatches.
+  a = b * c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'int4x3' (aka 'matrix<int, 4, 3>'))}}
+  b *= c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'int4x3' (aka 'matrix<int, 4, 3>'))}}
+  d = a * a;
+  // expected-error at -1 {{assigning to 'matrix<int, [2 * ...]>' from incompatible type 'matrix<float, [2 * ...]>'}}
+
+  p = a * a;
+  // expected-error at -1 {{assigning to 'uint16_t' (aka 'unsigned short') from incompatible type 'float4x4' (aka 'matrix<float, 4, 4>')}}
+}
+
+void mat_scalar_multiply(float4x4 a, float3x4 b, float sf, inout uint16_t p) {
+  // Shape of multiplication result does not match the type of b.
+  b = a * sf;
+  // expected-error at -1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}}
+  b = sf * a;
+  // expected-error at -1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}}
+
+  sf = a * sf;
+  // expected-error at -1 {{assigning to 'float' from incompatible type 'float4x4' (aka 'matrix<float, 4, 4>')}}
+}
+
+void mat_scalar_divide(float4x4 a, float3x4 b, float sf, inout uint16_t p) {
+  // Shape of division result does not match the type of b.
+  b = a / sf;
+  // expected-error at -1 {{assigning to 'matrix<[...], 3, [...]>' from incompatible type 'matrix<[...], 4, [...]>'}}
+  b = sf / a;
+  // expected-error at -1 {{invalid operands to binary expression ('float' and 'float4x4' (aka 'matrix<float, 4, 4>'))}}
+
+  a = p / a;
+  // expected-error at -1 {{invalid operands to binary expression ('uint16_t' (aka 'unsigned short') and 'float4x4' (aka 'matrix<float, 4, 4>'))}}
+
+  sf = a / sf;
+  // expected-error at -1 {{assigning to 'float' from incompatible type 'float4x4' (aka 'matrix<float, 4, 4>')}}
+}
+
+void matrix_matrix_divide(float4x4 a, float3x4 b, int4x3 c, int4x4 d, float sf, uint16_t p) {
+  // Matrix by matrix division is not supported.
+  a = a / a;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float4x4')}}
+
+  b = a / a;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float4x4')}}
+
+  // Check element type mismatches.
+  a = b / c;
+  // expected-error at -1 {{invalid operands to binary expression ('float3x4' (aka 'matrix<float, 3, 4>') and 'int4x3' (aka 'matrix<int, 4, 3>'))}}
+  d = a / a;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float4x4')}}
+
+  p = a / a;
+  // expected-error at -1 {{invalid operands to binary expression ('float4x4' (aka 'matrix<float, 4, 4>') and 'float4x4')}}
+}
+
+float3x4 get_matrix(void);
+
+void insert(float3x4 a, float f) {
+  // Non integer indexes.
+  a[1][f] = 0;
+  // expected-error at -1 {{matrix column index is not an integer}}
+  a[f][2] = 0;
+  // expected-error at -1 {{matrix row index is not an integer}}
+  a[f][f] = 0;
+  // expected-error at -1 {{matrix row index is not an integer}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+  a[0][f] = 0;
+  // expected-error at -1 {{matrix column index is not an integer}}
+
+  a[f][f] = 0;
+  // expected-error at -1 {{matrix row index is not an integer}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+
+  // Indexes outside allowed dimensions.
+  a[-1][3] = 10.0;
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  a[2][-1] = 10.0;
+  // expected-error at -1 {{matrix column index is outside the allowed range [0, 4)}}
+  a[2][-1u] = 10.0;
+  // expected-error at -1 {{matrix column index is outside the allowed range [0, 4)}}
+  a[-1u][3] = 10.0;
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  a[5][2] = 10.0;
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  a[2][10] = 10.0;
+  // expected-error at -1 {{matrix column index is outside the allowed range [0, 4)}}
+  a[3][2.0] = f;
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+  (a[1])[1] = f;
+  // expected-error at -1 {{matrix row and column subscripts cannot be separated by any expression}}
+
+  get_matrix()[0][0] = f;
+  // expected-error at -1 {{expression is not assignable}}
+  get_matrix()[3][1.0] = f;
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+
+  (get_matrix()[0])[2] = f;
+  // expected-error at -1 {{matrix row and column subscripts cannot be separated by any expression}}
+
+  a[4, 5] = 5.0;
+  // expected-error at -1 {{comma expressions are not allowed as indices in matrix subscript expressions}}
+  // expected-warning at -2 {{left operand of comma operator has no effect}}
+
+  a[4, 5, 4] = 5.0;
+  // expected-error at -1 {{comma expressions are not allowed as indices in matrix subscript expressions}}
+  // expected-warning at -2 {{left operand of comma operator has no effect}}
+  // expected-warning at -3 {{left operand of comma operator has no effect}}
+}
+
+void extract(float3x4 a, float f) {
+  // Non integer indexes.
+  float v1 = a[2][f];
+  // expected-error at -1 {{matrix column index is not an integer}}
+  float v2 = a[f][3];
+  // expected-error at -1 {{matrix row index is not an integer}}
+  float v3 = a[f][f];
+  // expected-error at -1 {{matrix row index is not an integer}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+
+  // Indexes outside allowed dimensions.
+  float v5 = a[-1][3];
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  float v6 = a[2][-1];
+  // expected-error at -1 {{matrix column index is outside the allowed range [0, 4)}}
+  float v8 = a[-1u][3];
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  float v9 = a[5][2];
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  float v10 = a[2][4];
+  // expected-error at -1 {{matrix column index is outside the allowed range [0, 4)}}
+  float v11 = a[3][2.0];
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+
+  float v12 = get_matrix()[0][0];
+  float v13 = get_matrix()[3][2.0];
+  // expected-error at -1 {{matrix row index is outside the allowed range [0, 3)}}
+  // expected-error at -2 {{matrix column index is not an integer}}
+
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = matrix<EltTy, Rows, Columns>;
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t add(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy1, R1, C1> B) {
+  uint16_t v1 = A.value + B.value;
+  // expected-error at -1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 3>') and 'matrix_t' (aka 'matrix<float, 2, 2>'))}}
+  // expected-error at -3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+
+  return A.value + B.value;
+  // expected-error at -1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 3>') and 'matrix_t' (aka 'matrix<float, 2, 2>'))}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+}
+
+void test_add_template() {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  unsigned v1 = add<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1);
+  // expected-error at -1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+  // expected-note at -2 {{in instantiation of function template specialization 'add<unsigned int, 2U, 2U, unsigned int, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat1.value = add<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note at -1 {{in instantiation of function template specialization 'add<unsigned int, 2U, 2U, unsigned int, 3U, 3U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat1.value = add<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note at -1 {{in instantiation of function template specialization 'add<unsigned int, 3U, 3U, float, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t subtract(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy1, R1, C1> B) {
+  uint16_t v1 = A.value - B.value;
+  // expected-error at -1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 3>') and 'matrix_t' (aka 'matrix<float, 2, 2>'))}}
+  // expected-error at -3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+
+  return A.value - B.value;
+  // expected-error at -1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 3>') and 'matrix_t' (aka 'matrix<float, 2, 2>'))}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+}
+
+void test_subtract_template() {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  unsigned v1 = subtract<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1);
+  // expected-error at -1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+  // expected-note at -2 {{in instantiation of function template specialization 'subtract<unsigned int, 2U, 2U, unsigned int, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat1.value = subtract<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note at -1 {{in instantiation of function template specialization 'subtract<unsigned int, 2U, 2U, unsigned int, 3U, 3U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat1.value = subtract<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note at -1 {{in instantiation of function template specialization 'subtract<unsigned int, 3U, 3U, float, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t multiply(inout MyMatrix<EltTy0, R0, C0> A, inout MyMatrix<EltTy1, R1, C1> B) {
+  uint16_t v1 = A.value * B.value;
+  // expected-error at -1 {{cannot initialize a variable of type 'uint16_t' (aka 'unsigned short') with an rvalue of type 'matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+  // expected-error at -3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<float, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 2, 2>'))}}
+
+  MyMatrix<int, 5, 6> m;
+  B.value = m.value * A.value;
+  // expected-error at -1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<int, 5, 6>') and 'matrix_t' (aka 'matrix<unsigned int, 2, 2>'))}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<int, 5, 6>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 2>'))}}
+  // expected-error at -3 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<int, 5, 6>') and 'matrix_t' (aka 'matrix<float, 2, 2>'))}}
+
+  return A.value * B.value;
+  // expected-error at -1 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 3, 3>'))}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<float, 2, 2>') and 'matrix_t' (aka 'matrix<unsigned int, 2, 2>'))}}
+}
+
+void test_multiply_template() {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  unsigned v1 = multiply<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1);
+  // expected-note at -1 {{in instantiation of function template specialization 'multiply<unsigned int, 2U, 2U, unsigned int, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+  // expected-error at -2 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'matrix<unsigned int, 2, 2>')}}
+
+  MyMatrix<unsigned, 3, 2> Mat4;
+  Mat1.value = multiply<unsigned, 3, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat4, Mat2);
+  // expected-note at -1 {{in instantiation of function template specialization 'multiply<unsigned int, 3U, 2U, unsigned int, 3U, 3U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat1.value = multiply<float, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat3, Mat1);
+  // expected-note at -1 {{in instantiation of function template specialization 'multiply<float, 2U, 2U, unsigned int, 2U, 2U, unsigned int, 2U, 2U>' requested here}}
+
+  Mat4.value = Mat4.value * Mat1;
+  // expected-error at -1 {{no viable conversion from 'MyMatrix<unsigned int, 2, 2>' to 'unsigned int'}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<unsigned int, 3, 2>') and 'MyMatrix<unsigned int, 2, 2>')}}
+}
+
+struct UserT {};
+
+struct StructWithC {
+  operator UserT() {
+    // expected-note at -1 4 {{candidate function}}
+    return {};
+  }
+};
+
+void test_DoubleWrapper(inout MyMatrix<double, 4, 3> m, inout StructWithC c) {
+  m.value = m.value + c;
+  // expected-error at -1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<double, 4, 3>') and 'StructWithC')}}
+
+  m.value = c + m.value;
+  // expected-error at -1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error at -2 {{invalid operands to binary expression ('StructWithC' and 'matrix_t' (aka 'matrix<double, 4, 3>'))}}
+
+  m.value = m.value - c;
+  // expected-error at -1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error at -2 {{invalid operands to binary expression ('matrix_t' (aka 'matrix<double, 4, 3>') and 'StructWithC')}}
+
+  m.value = c - m.value;
+  // expected-error at -1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error at -2 {{invalid operands to binary expression ('StructWithC' and 'matrix_t' (aka 'matrix<double, 4, 3>'))}}
+}
+
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl
new file mode 100644
index 00000000000000..fe374f388d104b
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/matrix-type.hlsl
@@ -0,0 +1,48 @@
+// A note points to the external source at present, so we have to ignore it.
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only %s -verify -verify-ignore-unexpected=note
+// All the errors are actually in the external source at present, so we have to ignore them.
+// The notes point to the proper lines though.
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan-compute -fnative-half-type -finclude-default-header -fsyntax-only -DMTXTYPE %s -verify=mtxtype -verify-ignore-unexpected=error
+
+#ifndef MTXTYPE
+void matrix_var_dimensions(int Rows, unsigned Columns, uint16_t C) {
+  // expected-note at -1 3{{declared here}}
+  matrix<int, Rows, 1> m1;    // expected-error{{non-type template argument is not a constant expression}}
+  // expected-note at -1{{function parameter 'Rows' with unknown value cannot be used in a constant expression}}
+  matrix<int, 1, Columns> m2; // expected-error{{non-type template argument is not a constant expression}}
+  // expected-note at -1{{function parameter 'Columns' with unknown value cannot be used in a constant expression}}
+  matrix<int, C, C> m3;       // expected-error{{non-type template argument is not a constant expression}}
+  // expected-note at -1{{function parameter 'C' with unknown value cannot be used in a constant expression}}
+  matrix<int, char, 0> m8;    // expected-error{{template argument for non-type template parameter must be an expression}}
+
+}
+#else
+struct S1 {};
+
+enum TestEnum {
+  A,
+  B
+};
+
+void matrix_unsupported_element_type() {
+  // The future-errors are not checked yet since they are predeclared and are ignored.
+  matrix<S1, 1, 1> m1;       // future-error{{invalid matrix element type 'S1'}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<bool, 1, 1> m2;     // future-error{{invalid matrix element type 'bool'}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<TestEnum, 1, 1> m3; // future-error{{invalid matrix element type 'TestEnum'}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+
+  matrix<int, -1, 1> m4;      // future-error{{matrix row size too large}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<int, 1, -1> m5;      // future-error{{matrix column size too large}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<int, 0, 1> m6;       // future-error{{zero matrix size}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<int, 1, 0> m7;       // future-error{{zero matrix size}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+  matrix<int, 1048576, 1> m9; // future-error{{matrix row size too large}}
+  // mtxtype-note at -1{{in instantiation of template type alias 'matrix' requested here}}
+
+}
+#endif
diff --git a/clang/test/SemaTemplate/matrix-type.cpp b/clang/test/SemaTemplate/matrix-type.cpp
index 1843c0a1a6ed62..dd9cbf1165b02c 100644
--- a/clang/test/SemaTemplate/matrix-type.cpp
+++ b/clang/test/SemaTemplate/matrix-type.cpp
@@ -64,7 +64,7 @@ matrix<float, R + 1, C + 2> use_matrix_2(matrix<int, R, C> &m) {}
 
 template <unsigned long R, unsigned long C>
 void use_matrix_2(matrix<int, R + 2, C / 2> &m1, matrix<float, R, C> &m2) {}
-// expected-note at -1 {{candidate function [with R = 3, C = 11] not viable: no known conversion from 'matrix<int, 5, 6>' (aka 'int __attribute__((matrix_type(5, 6)))') to 'matrix<int, 3UL + 2, 11UL / 2> &' (aka 'int  __attribute__((matrix_type(5, 5)))&') for 1st argument}}
+// expected-note at -1 {{candidate function [with R = 3, C = 11] not viable: no known conversion from 'matrix<int, 5, 6>' (aka 'int __attribute__((matrix_type(5, 6)))') to 'matrix<int, 3UL + 2, 11UL / 2> &' (aka 'int & __attribute__((matrix_type(5, 5)))') for 1st argument}}
 // expected-note at -2 {{candidate template ignored: deduced type 'matrix<float, 3UL, 4UL>' of 2nd parameter does not match adjusted type 'matrix<int, 3, 4>' of argument [with R = 3, C = 4]}}
 
 template <typename T, unsigned long R, unsigned long C>

>From 1fa442bfcd9e428753332f33da39510986b5811a Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth at microsoft.com>
Date: Mon, 7 Oct 2024 12:08:56 -0600
Subject: [PATCH 2/2] clang-format

---
 clang/lib/Sema/HLSLExternalSemaSource.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index d1a53d2ad88864..e8e4e802051740 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -493,7 +493,7 @@ void HLSLExternalSemaSource::defineHLSLMatrixAlias() {
       false, AST.getTrivialTypeSourceInfo(AST.IntTy));
   llvm::APInt RVal(AST.getIntWidth(AST.IntTy), 4);
   TemplateArgument RDefault(AST, llvm::APSInt(std::move(RVal)), AST.IntTy,
-                           /*IsDefaulted=*/true);
+                            /*IsDefaulted=*/true);
   RowsParam->setDefaultArgument(
       AST, SemaPtr->getTrivialTemplateArgumentLoc(RDefault, AST.IntTy,
                                                   SourceLocation(), RowsParam));
@@ -505,7 +505,7 @@ void HLSLExternalSemaSource::defineHLSLMatrixAlias() {
       false, AST.getTrivialTypeSourceInfo(AST.IntTy));
   llvm::APInt CVal(AST.getIntWidth(AST.IntTy), 4);
   TemplateArgument CDefault(AST, llvm::APSInt(std::move(CVal)), AST.IntTy,
-                           /*IsDefaulted=*/true);
+                            /*IsDefaulted=*/true);
   ColsParam->setDefaultArgument(
       AST, SemaPtr->getTrivialTemplateArgumentLoc(CDefault, AST.IntTy,
                                                   SourceLocation(), ColsParam));