[clang] [HLSL][Matrix] Allow flattening matrix types in FlattenAccessAndTypeLValue (PR #177708)

Tue Feb 17 11:29:47 PST 2026

https://github.com/Icohedron updated https://github.com/llvm/llvm-project/pull/177708

>From 21287b87994c3dce5e1615c9bace88d819526324 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Fri, 23 Jan 2026 15:13:32 -0800
Subject: [PATCH 1/6] Enable explicit cast of matrices to vectors

---
 clang/lib/CodeGen/CGExpr.cpp                  | 21 +++++-
 .../BasicFeatures/MatrixToVectorCast.hlsl     | 70 +++++++++++++++++++
 2 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 76a3939cd28eb..5d595c2e3b1e5 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7117,8 +7117,6 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
   while (!WorkList.empty()) {
     auto [LVal, T, IdxList] = WorkList.pop_back_val();
     T = T.getCanonicalType().getUnqualifiedType();
-    assert(!isa<MatrixType>(T) && "Matrix types not yet supported in HLSL");
-
     if (const auto *CAT = dyn_cast<ConstantArrayType>(T)) {
       uint64_t Size = CAT->getZExtSize();
       for (int64_t I = Size - 1; I > -1; I--) {
@@ -7192,6 +7190,25 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
                                   Base.getBaseInfo(), TBAAAccessInfo());
         AccessList.emplace_back(LV);
       }
+    } else if (const auto *MT = dyn_cast<ConstantMatrixType>(T)) {
+      // Matrices are represented as flat arrays in memory, but have a vector
+      // value type. MaybeConvertMatrixAddress converts the address from array
+      // to vector, and elements are extracted as in the vector case above.
+      // The order in which we iterate over the elements is sequentially in
+      // memory; whether the matrix is in row- or column-major order does not
+      // matter.
+      llvm::Type *LLVMT = ConvertTypeForMem(T);
+      CharUnits Align = getContext().getTypeAlignInChars(T);
+      Address GEP = Builder.CreateInBoundsGEP(LVal.getAddress(), IdxList, LLVMT,
+                                              Align, "matrix.gep");
+      LValue Base = MakeAddrLValue(GEP, T);
+      Address MatAddr = MaybeConvertMatrixAddress(Base.getAddress(), *this);
+      for (unsigned I = 0, E = MT->getNumElementsFlattened(); I < E; I++) {
+        llvm::Constant *Idx = llvm::ConstantInt::get(IdxTy, I);
+        LValue LV = LValue::MakeMatrixElt(MatAddr, Idx, MT->getElementType(),
+                                          Base.getBaseInfo(), TBAAAccessInfo());
+        AccessList.emplace_back(LV);
+      }
     } else { // a scalar/builtin type
       if (!IdxList.empty()) {
         llvm::Type *LLVMT = ConvertTypeForMem(T);
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
new file mode 100644
index 0000000000000..06af2ebe72473
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
@@ -0,0 +1,70 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
+
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z2fnu11matrix_typeILm2ELm2EfE(
+// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
+// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[MATRIXEXT]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[MATRIXEXT1]], i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[MATRIXEXT2]], i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+// CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[MATRIXEXT3]], i64 3
+// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[V]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[V]], align 16
+// CHECK-NEXT:    ret <4 x float> [[TMP10]]
+//
+float4 fn(float2x2 M) {
+    float4 V = (float4)M;
+    return V;
+}
+
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z3fn2u11matrix_typeILm3ELm1EiE(
+// CHECK-SAME: <3 x i32> noundef [[M:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [3 x i32], align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x i32], align 4
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    store <3 x i32> [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <3 x i32> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
+// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <3 x i32>, ptr [[HLSL_EWCAST_SRC]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[FLATCAST_TMP]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[MATRIXEXT]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[MATRIXEXT1]], i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <3 x i32> [[TMP6]], i32 2
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[MATRIXEXT2]], i64 2
+// CHECK-NEXT:    store <3 x i32> [[TMP7]], ptr [[V]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <3 x i32>, ptr [[V]], align 16
+// CHECK-NEXT:    ret <3 x i32> [[TMP8]]
+//
+int3 fn2(int3x1 M) {
+    int3 V = (int3)M;
+    return V;
+}
+
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// COL-CHECK: {{.*}}
+// ROW-CHECK: {{.*}}

>From 805aebe5ff6bd6b39f07c383d5531cc51759c25e Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 26 Jan 2026 10:17:01 -0800
Subject: [PATCH 2/6] Adjust index iteration for matrix memory layout

---
 clang/lib/CodeGen/CGExpr.cpp                   | 18 ++++++++++++++----
 .../BasicFeatures/MatrixToVectorCast.hlsl      | 12 +++++-------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5d595c2e3b1e5..9a6629e4b6f9f 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7194,17 +7194,27 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
       // Matrices are represented as flat arrays in memory, but have a vector
       // value type. MaybeConvertMatrixAddress converts the address from array
       // to vector, and elements are extracted as in the vector case above.
-      // The order in which we iterate over the elements is sequentially in
-      // memory; whether the matrix is in row- or column-major order does not
-      // matter.
+      // The order in which we iterate over the elements must respect the
+      // matrix memory layout, computing the proper index for each (row, col).
       llvm::Type *LLVMT = ConvertTypeForMem(T);
       CharUnits Align = getContext().getTypeAlignInChars(T);
       Address GEP = Builder.CreateInBoundsGEP(LVal.getAddress(), IdxList, LLVMT,
                                               Align, "matrix.gep");
       LValue Base = MakeAddrLValue(GEP, T);
       Address MatAddr = MaybeConvertMatrixAddress(Base.getAddress(), *this);
+      unsigned NumRows = MT->getNumRows();
+      unsigned NumCols = MT->getNumColumns();
+      bool IsMatrixRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
+                              LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+      llvm::MatrixBuilder MB(Builder);
       for (unsigned I = 0, E = MT->getNumElementsFlattened(); I < E; I++) {
-        llvm::Constant *Idx = llvm::ConstantInt::get(IdxTy, I);
+        // Compute (row, col) from linear index assuming row-major iteration.
+        unsigned Row = I / NumCols;
+        unsigned Col = I % NumCols;
+        llvm::Value *RowIdx = llvm::ConstantInt::get(IdxTy, Row);
+        llvm::Value *ColIdx = llvm::ConstantInt::get(IdxTy, Col);
+        llvm::Value *Idx =
+            MB.CreateIndex(RowIdx, ColIdx, NumRows, NumCols, IsMatrixRowMajor);
         LValue LV = LValue::MakeMatrixElt(MatAddr, Idx, MT->getElementType(),
                                           Base.getBaseInfo(), TBAAAccessInfo());
         AccessList.emplace_back(LV);
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
index 06af2ebe72473..0485fb73197ad 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
@@ -1,4 +1,3 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
 
@@ -18,10 +17,12 @@
 // CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 // CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[MATRIXEXT]], i64 0
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// COL-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+// ROW-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
 // CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[MATRIXEXT1]], i64 1
 // CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// COL-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+// ROW-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
 // CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[MATRIXEXT2]], i64 2
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
 // CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
@@ -36,7 +37,7 @@ float4 fn(float2x2 M) {
 }
 
 // CHECK-LABEL: define hidden noundef <3 x i32> @_Z3fn2u11matrix_typeILm3ELm1EiE(
-// CHECK-SAME: <3 x i32> noundef [[M:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: <3 x i32> noundef [[M:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [3 x i32], align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca <3 x i32>, align 16
@@ -65,6 +66,3 @@ int3 fn2(int3x1 M) {
     return V;
 }
 
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// COL-CHECK: {{.*}}
-// ROW-CHECK: {{.*}}

>From 98e55a9ea34b9387600ca6d5e650d81099f6dd9c Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Tue, 27 Jan 2026 10:48:18 -0800
Subject: [PATCH 3/6] Move tests to existing VectorElementwiseCast test file

---
 .../BasicFeatures/MatrixToVectorCast.hlsl     | 68 -------------------
 .../BasicFeatures/VectorElementwiseCast.hlsl  | 61 ++++++++++++++++-
 2 files changed, 60 insertions(+), 69 deletions(-)
 delete mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl

diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
deleted file mode 100644
index 0485fb73197ad..0000000000000
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToVectorCast.hlsl
+++ /dev/null
@@ -1,68 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
-
-// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z2fnu11matrix_typeILm2ELm2EfE(
-// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [4 x float], align 4
-// CHECK-NEXT:    [[V:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [4 x float], align 4
-// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    store <4 x float> [[M]], ptr [[M_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
-// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
-// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[MATRIXEXT]], i64 0
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// COL-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
-// ROW-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[MATRIXEXT1]], i64 1
-// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// COL-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
-// ROW-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[MATRIXEXT2]], i64 2
-// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
-// CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[MATRIXEXT3]], i64 3
-// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[V]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[V]], align 16
-// CHECK-NEXT:    ret <4 x float> [[TMP10]]
-//
-float4 fn(float2x2 M) {
-    float4 V = (float4)M;
-    return V;
-}
-
-// CHECK-LABEL: define hidden noundef <3 x i32> @_Z3fn2u11matrix_typeILm3ELm1EiE(
-// CHECK-SAME: <3 x i32> noundef [[M:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [3 x i32], align 4
-// CHECK-NEXT:    [[V:%.*]] = alloca <3 x i32>, align 16
-// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x i32], align 4
-// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <3 x i32>, align 16
-// CHECK-NEXT:    store <3 x i32> [[M]], ptr [[M_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4
-// CHECK-NEXT:    store <3 x i32> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
-// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <3 x i32>, ptr [[HLSL_EWCAST_SRC]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[FLATCAST_TMP]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[MATRIXEXT]], i64 0
-// CHECK-NEXT:    [[TMP4:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP4]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[MATRIXEXT1]], i64 1
-// CHECK-NEXT:    [[TMP6:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
-// CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <3 x i32> [[TMP6]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[MATRIXEXT2]], i64 2
-// CHECK-NEXT:    store <3 x i32> [[TMP7]], ptr [[V]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = load <3 x i32>, ptr [[V]], align 16
-// CHECK-NEXT:    ret <3 x i32> [[TMP8]]
-//
-int3 fn2(int3x1 M) {
-    int3 V = (int3)M;
-    return V;
-}
-
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
index 26aa41aaf4626..c11c8498ada45 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
 
 // vector flat cast from array
 // CHECK-LABEL: define void {{.*}}call2
@@ -121,3 +122,61 @@ struct Derived : BFields {
 export void call6(Derived D) {
   int4 A = (int4)D;
 }
+
+// vector flat cast from matrix of same size (float)
+// CHECK-LABEL: call7
+// CHECK:    [[M_ADDR:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> %M, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
+// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[MATRIXEXT]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// COL-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+// ROW-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[MATRIXEXT1]], i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// COL-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
+// ROW-CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[MATRIXEXT2]], i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+// CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[MATRIXEXT3]], i64 3
+// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[V]], align 16
+// CHECK-NEXT:    ret void
+export void call7(float2x2 M) {
+    float4 V = (float4)M;
+}
+
+// vector flat cast from matrix of same size (int)
+// CHECK-LABEL: call8
+// CHECK:    [[M_ADDR:%.*]] = alloca [3 x i32], align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x i32], align 4
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    store <3 x i32> %M, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <3 x i32> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
+// CHECK-NEXT:    [[MATRIX_GEP:%.*]] = getelementptr inbounds <3 x i32>, ptr [[HLSL_EWCAST_SRC]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[FLATCAST_TMP]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[MATRIXEXT]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[MATRIXEXT1]], i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
+// CHECK-NEXT:    [[MATRIXEXT2:%.*]] = extractelement <3 x i32> [[TMP6]], i32 2
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[MATRIXEXT2]], i64 2
+// CHECK-NEXT:    store <3 x i32> [[TMP7]], ptr [[V]], align 16
+// CHECK-NEXT:    ret void
+export void call8(int3x1 M) {
+    int3 V = (int3)M;
+}
+

>From cd13943548d84e49290501a03165113fcfbbf6a8 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Tue, 27 Jan 2026 10:53:00 -0800
Subject: [PATCH 4/6] Fixup code comments regarding matrix element indexing

---
 clang/lib/CodeGen/CGExpr.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 9a6629e4b6f9f..fcd5a30d6ac19 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7194,8 +7194,8 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
       // Matrices are represented as flat arrays in memory, but have a vector
       // value type. MaybeConvertMatrixAddress converts the address from array
       // to vector, and elements are extracted as in the vector case above.
-      // The order in which we iterate over the elements must respect the
-      // matrix memory layout, computing the proper index for each (row, col).
+      // The matrix elements are iterated over in row-major order regardless of
+      // the memory layout of the matrix.
       llvm::Type *LLVMT = ConvertTypeForMem(T);
       CharUnits Align = getContext().getTypeAlignInChars(T);
       Address GEP = Builder.CreateInBoundsGEP(LVal.getAddress(), IdxList, LLVMT,
@@ -7208,7 +7208,6 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
                               LangOptions::MatrixMemoryLayout::MatrixRowMajor;
       llvm::MatrixBuilder MB(Builder);
       for (unsigned I = 0, E = MT->getNumElementsFlattened(); I < E; I++) {
-        // Compute (row, col) from linear index assuming row-major iteration.
         unsigned Row = I / NumCols;
         unsigned Col = I % NumCols;
         llvm::Value *RowIdx = llvm::ConstantInt::get(IdxTy, Row);

>From 77fe68ddfddb0b3dae34ca900c00df8a5ace7d43 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Tue, 27 Jan 2026 14:16:41 -0800
Subject: [PATCH 5/6] Use two-dimensional for-loop instead of a single for-loop

---
 clang/lib/CodeGen/CGExpr.cpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index fcd5a30d6ac19..490377c04b034 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7207,16 +7207,17 @@ void CodeGenFunction::FlattenAccessAndTypeLValue(
       bool IsMatrixRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
                               LangOptions::MatrixMemoryLayout::MatrixRowMajor;
       llvm::MatrixBuilder MB(Builder);
-      for (unsigned I = 0, E = MT->getNumElementsFlattened(); I < E; I++) {
-        unsigned Row = I / NumCols;
-        unsigned Col = I % NumCols;
-        llvm::Value *RowIdx = llvm::ConstantInt::get(IdxTy, Row);
-        llvm::Value *ColIdx = llvm::ConstantInt::get(IdxTy, Col);
-        llvm::Value *Idx =
-            MB.CreateIndex(RowIdx, ColIdx, NumRows, NumCols, IsMatrixRowMajor);
-        LValue LV = LValue::MakeMatrixElt(MatAddr, Idx, MT->getElementType(),
-                                          Base.getBaseInfo(), TBAAAccessInfo());
-        AccessList.emplace_back(LV);
+      for (unsigned Row = 0; Row < MT->getNumRows(); Row++) {
+        for (unsigned Col = 0; Col < MT->getNumColumns(); Col++) {
+          llvm::Value *RowIdx = llvm::ConstantInt::get(IdxTy, Row);
+          llvm::Value *ColIdx = llvm::ConstantInt::get(IdxTy, Col);
+          llvm::Value *Idx = MB.CreateIndex(RowIdx, ColIdx, NumRows, NumCols,
+                                            IsMatrixRowMajor);
+          LValue LV =
+              LValue::MakeMatrixElt(MatAddr, Idx, MT->getElementType(),
+                                    Base.getBaseInfo(), TBAAAccessInfo());
+          AccessList.emplace_back(LV);
+        }
       }
     } else { // a scalar/builtin type
       if (!IdxList.empty()) {

>From 4a7d85e9a8d62a9e198b225d47dd1b422feca5a2 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Tue, 17 Feb 2026 11:29:20 -0800
Subject: [PATCH 6/6] Update matrix allocas in test for array of vectors
 representation

---
 .../BasicFeatures/VectorElementwiseCast.hlsl           | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
index c11c8498ada45..5242efd77b3b4 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
@@ -125,9 +125,9 @@ export void call6(Derived D) {
 
 // vector flat cast from matrix of same size (float)
 // CHECK-LABEL: call7
-// CHECK:    [[M_ADDR:%.*]] = alloca [4 x float], align 4
+// CHECK:    [[M_ADDR:%.*]] = alloca [2 x <2 x float>], align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [2 x <2 x float>], align 4
 // CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 16
 // CHECK-NEXT:    store <4 x float> %M, ptr [[M_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
@@ -156,9 +156,11 @@ export void call7(float2x2 M) {
 
 // vector flat cast from matrix of same size (int)
 // CHECK-LABEL: call8
-// CHECK:    [[M_ADDR:%.*]] = alloca [3 x i32], align 4
+// COL-CHECK:    [[M_ADDR:%.*]] = alloca [1 x <3 x i32>], align 4
+// ROW-CHECK:    [[M_ADDR:%.*]] = alloca [3 x <1 x i32>], align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca <3 x i32>, align 16
-// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x i32], align 4
+// COL-CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [1 x <3 x i32>], align 4
+// ROW-CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x <1 x i32>], align 4
 // CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <3 x i32>, align 16
 // CHECK-NEXT:    store <3 x i32> %M, ptr [[M_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4