[clang] [llvm] [HLSL][Matrix] Support row-major `transpose` and `mul` by inserting matrix memory layout transformations (PR #186898)

Fri Mar 20 13:06:24 PDT 2026

https://github.com/Icohedron updated https://github.com/llvm/llvm-project/pull/186898

>From 03162e8a21e0fa21a2a518c7fbda0c4d2b1b91a6 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 14:16:31 -0700
Subject: [PATCH 1/7] Insert matrix memory layout transposes before and after
 matrix intrinsics

The SPIRV and DXIL backends assume matrices are provided in column-major
order when lowering matrix transpose and matrix multiplication
intrinsics.

To support row-major order matrices from Clang/HLSL, we therefore need
to convert row-major order matrices into column-major order matrices
before applying matrix transpose and multiplication. A conversion
from column-major order back to row-major order is also required for
correctness after a matrix transpose or matrix multiply.

This commit adds helper functions to the MatrixBuilder that convert an
NxM row-major matrix into an NxM column-major matrix, and vice versa, by
applying a matrix transpose.

These matrix memory layout transposes are inserted before and after HLSL
transpose and mul when lowering.

Assisted-by: GitHub Copilot (powered by Claude Opus 4.6)
---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 46 +++++++++++--
 clang/test/CodeGenHLSL/builtins/mul.hlsl      | 42 ++++++++----
 .../test/CodeGenHLSL/builtins/transpose.hlsl  | 64 ++++++++++++-------
 llvm/include/llvm/IR/MatrixBuilder.h          | 16 +++++
 4 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 80c590437309d..a891864e6d964 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1122,32 +1122,64 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     bool IsMat0 = QTy0->isConstantMatrixType();
     bool IsMat1 = QTy1->isConstantMatrixType();
 
+    // The matrix multiply intrinsic only operates on column-major order
+    // matrices. Therefore matrix memory layout transforms must be inserted
+    // before and after matrix multiply intrinsics.
+    bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
+                      LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+
     llvm::MatrixBuilder MB(Builder);
     if (IsVec0 && IsMat1) {
       unsigned N = QTy0->castAs<VectorType>()->getNumElements();
       auto *MatTy = QTy1->castAs<ConstantMatrixType>();
-      unsigned M = MatTy->getNumColumns();
-      return MB.CreateMatrixMultiply(Op0, Op1, 1, N, M, "hlsl.mul");
+      unsigned Rows = MatTy->getNumRows();
+      unsigned Cols = MatTy->getNumColumns();
+      if (IsRowMajor)
+        Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows, Cols);
+      return MB.CreateMatrixMultiply(Op0, Op1, 1, N, Cols, "hlsl.mul");
     }
     if (IsMat0 && IsVec1) {
       auto *MatTy = QTy0->castAs<ConstantMatrixType>();
       unsigned Rows = MatTy->getNumRows();
       unsigned Cols = MatTy->getNumColumns();
+      if (IsRowMajor)
+        Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
       return MB.CreateMatrixMultiply(Op0, Op1, Rows, Cols, 1, "hlsl.mul");
     }
     assert(IsMat0 && IsMat1);
     auto *MatTy0 = QTy0->castAs<ConstantMatrixType>();
     auto *MatTy1 = QTy1->castAs<ConstantMatrixType>();
-    return MB.CreateMatrixMultiply(Op0, Op1, MatTy0->getNumRows(),
-                                   MatTy0->getNumColumns(),
-                                   MatTy1->getNumColumns(), "hlsl.mul");
+    unsigned Rows0 = MatTy0->getNumRows();
+    unsigned Rows1 = MatTy1->getNumRows();
+    unsigned Cols0 = MatTy0->getNumColumns();
+    unsigned Cols1 = MatTy1->getNumColumns();
+    if (IsRowMajor) {
+      Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows0, Cols0);
+      Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows1, Cols1);
+    }
+    Value *Result =
+        MB.CreateMatrixMultiply(Op0, Op1, Rows0, Cols0, Cols1, "hlsl.mul");
+    if (IsRowMajor)
+      Result = MB.CreateColumnMajorToRowMajorTransform(Result, Rows0, Cols1);
+    return Result;
   }
   case Builtin::BI__builtin_hlsl_transpose: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     auto *MatTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
+    unsigned Rows = MatTy->getNumRows();
+    unsigned Cols = MatTy->getNumColumns();
     llvm::MatrixBuilder MB(Builder);
-    return MB.CreateMatrixTranspose(Op0, MatTy->getNumRows(),
-                                    MatTy->getNumColumns());
+    // The matrix transpose intrinsic only operates on column-major order
+    // matrices. Therefore matrix memory layout transforms must be inserted
+    // before and after matrix transpose intrinsics.
+    bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
+                      LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+    if (IsRowMajor)
+      Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
+    Value *Result = MB.CreateMatrixTranspose(Op0, Rows, Cols);
+    if (IsRowMajor)
+      Result = MB.CreateColumnMajorToRowMajorTransform(Result, Cols, Rows);
+    return Result;
   }
   case Builtin::BI__builtin_hlsl_elementwise_rcp: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index 17749e527af65..f9151225d3846 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,DXIL
-// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,SPIRV
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,DXIL
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,SPIRV
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,DXIL
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,SPIRV
 
 // -- Case 1: scalar * scalar -> scalar --
 
@@ -74,7 +76,8 @@ export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
 // -- Case 6: vector * matrix -> vector --
 
 // CHECK-LABEL: test_vec_mat_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %{{.*}}, i32 1, i32 2, i32 3)
 // CHECK: ret <3 x float> %hlsl.mul
 export float3 test_vec_mat_mul(float2 v, float2x3 m) { return mul(v, m); }
 
@@ -90,22 +93,31 @@ export float2x3 test_mat_scalar_mul(float2x3 a, float b) { return mul(a, b); }
 // -- Case 8: matrix * vector -> vector --
 
 // CHECK-LABEL: test_mat_vec_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %{{.*}}, <3 x float> %v, i32 2, i32 3, i32 1)
 // CHECK: ret <2 x float> %hlsl.mul
 export float2 test_mat_vec_mul(float2x3 m, float3 v) { return mul(m, v); }
 
 // -- Case 9: matrix * matrix -> matrix --
 
 // CHECK-LABEL: test_mat_mat_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %a, <12 x float> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x float> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %{{.*}}, <12 x float> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x float> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x float> %[[TRANSPOSE_RES]]
 export float2x4 test_mat_mat_mul(float2x3 a, float3x4 b) { return mul(a, b); }
 
 // -- Integer matrix multiply --
 
 // CHECK-LABEL: test_mat_mat_muli
-// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %a, <12 x i32> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x i32> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %{{.*}}, <12 x i32> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x i32> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x i32> %[[TRANSPOSE_RES]]
 export int2x4 test_mat_mat_muli(int2x3 a, int3x4 b) { return mul(a, b); }
 
 // -- Half-type overloads (native half) --
@@ -150,16 +162,22 @@ export half test_vec_vec_mulh(half3 a, half3 b) { return mul(a, b); }
 export half2x3 test_mat_scalar_mulh(half2x3 a, half b) { return mul(a, b); }
 
 // CHECK-LABEL: test_vec_mat_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %m, i32 1, i32 2, i32 3)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %{{.*}}, i32 1, i32 2, i32 3)
 // CHECK: ret <3 x half> %hlsl.mul
 export half3 test_vec_mat_mulh(half2 v, half2x3 m) { return mul(v, m); }
 
 // CHECK-LABEL: test_mat_vec_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %{{.*}}, <3 x half> %v, i32 2, i32 3, i32 1)
 // CHECK: ret <2 x half> %hlsl.mul
 export half2 test_mat_vec_mulh(half2x3 m, half3 v) { return mul(m, v); }
 
 // CHECK-LABEL: test_mat_mat_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %a, <12 x half> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x half> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %{{.*}}, <12 x half> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x half> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x half> @llvm.matrix.transpose.v8f16(<8 x half> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x half> %[[TRANSPOSE_RES]]
 export half2x4 test_mat_mat_mulh(half2x3 a, half3x4 b) { return mul(a, b); }
diff --git a/clang/test/CodeGenHLSL/builtins/transpose.hlsl b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
index 9018e3913269d..b0bb99c70fda2 100644
--- a/clang/test/CodeGenHLSL/builtins/transpose.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
@@ -1,42 +1,62 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR
 
 // CHECK-LABEL: define {{.*}}test_transpose_bool2x3
-// CHECK:    [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4
-// CHECK:    [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32>
-// CHECK:    store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
-// CHECK:    [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
-// CHECK:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
+// COLMAJOR:    [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4
+// ROWMAJOR:    [[A_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
+// CHECK:       [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32>
+// CHECK:       store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
+// CHECK:       [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
+// COLMAJOR:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
+// ROWMAJOR:    [[LAYOUT:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
+// ROWMAJOR:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[LAYOUT]], i32 2, i32 3)
+// ROWMAJOR:    {{.*}} = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[TRANS]], i32 3, i32 2)
 bool3x2 test_transpose_bool2x3(bool2x3 a) {
   return transpose(a);
 }
 
 // CHECK-LABEL: define {{.*}}test_transpose_int4x3
-// CHECK:    [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4
-// CHECK:    store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
-// CHECK:    [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
-// CHECK:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
-// CHECK:    ret <12 x i32> [[TRANS]]
+// COLMAJOR:    [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4
+// ROWMAJOR:    [[A_ADDR:%.*]] = alloca [4 x <3 x i32>], align 4
+// CHECK:       store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
+// CHECK:       [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
+// COLMAJOR:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
+// COLMAJOR:    ret <12 x i32> [[TRANS]]
+// ROWMAJOR:    [[LAYOUT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
+// ROWMAJOR:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[LAYOUT]], i32 4, i32 3)
+// ROWMAJOR:    [[RESULT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[TRANS]], i32 3, i32 4)
+// ROWMAJOR:    ret <12 x i32> [[RESULT]]
 int3x4 test_transpose_int4x3(int4x3 a) {
   return transpose(a);
 }
 
 // CHECK-LABEL: define {{.*}}test_transpose_float4x4
-// CHECK:    [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
-// CHECK:    store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
-// CHECK:    [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
-// CHECK:    [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// CHECK:    ret <16 x float> [[TRANS]]
+// CHECK:       [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
+// CHECK:       store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
+// CHECK:       [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
+// COLMAJOR:    [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// COLMAJOR:    ret <16 x float> [[TRANS]]
+// ROWMAJOR:    [[LAYOUT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// ROWMAJOR:    [[TRANS:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[LAYOUT]], i32 4, i32 4)
+// ROWMAJOR:    [[RESULT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[TRANS]], i32 4, i32 4)
+// ROWMAJOR:    ret <16 x float> [[RESULT]]
 float4x4 test_transpose_float4x4(float4x4 a) {
   return transpose(a);
 }
 
 // CHECK-LABEL: define {{.*}}test_transpose_double1x4
-// CHECK:    [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8
-// CHECK:    store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
-// CHECK:    [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
-// CHECK:    [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
-// CHECK:    ret <4 x double> [[TRANS]]
+// COLMAJOR:    [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8
+// ROWMAJOR:    [[A_ADDR:%.*]] = alloca [1 x <4 x double>], align 8
+// CHECK:       store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
+// CHECK:       [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
+// COLMAJOR:    [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
+// COLMAJOR:    ret <4 x double> [[TRANS]]
+// ROWMAJOR:    [[LAYOUT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
+// ROWMAJOR:    [[TRANS:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[LAYOUT]], i32 1, i32 4)
+// ROWMAJOR:    [[RESULT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[TRANS]], i32 4, i32 1)
+// ROWMAJOR:    ret <4 x double> [[RESULT]]
 double4x1 test_transpose_double1x4(double1x4 a) {
   return transpose(a);
 }
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index 5c891d644bfd2..b161e79a1cb6b 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -141,6 +141,22 @@ class MatrixBuilder {
     return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
   }
 
+  /// Create a column-major matrix from a row-major matrix with the given
+  /// logical dimensions by transposing it.
+  CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
+                                                 unsigned Columns,
+                                                 const Twine &Name = "") {
+    return CreateMatrixTranspose(Matrix, Columns, Rows, Name);
+  }
+
+  /// Create a row-major matrix from a column-major matrix with the given
+  /// logical dimensions by transposing it.
+  CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
+                                                 unsigned Columns,
+                                                 const Twine &Name = "") {
+    return CreateMatrixTranspose(Matrix, Rows, Columns, Name);
+  }
+
   /// Insert a single element \p NewVal into \p Matrix at indices (\p RowIdx, \p
   /// ColumnIdx).
   Value *CreateMatrixInsert(Value *Matrix, Value *NewVal, Value *RowIdx,

>From b5f29e0a645407234b75f73d1d7722640214c103 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 15:09:39 -0700
Subject: [PATCH 2/7] Add assertions for mul codegen to ensure dimensions are
 correct

---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index a891864e6d964..30c3098285865 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1134,6 +1134,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
       auto *MatTy = QTy1->castAs<ConstantMatrixType>();
       unsigned Rows = MatTy->getNumRows();
       unsigned Cols = MatTy->getNumColumns();
+      assert(N == Rows && "vector length must match matrix row count");
       if (IsRowMajor)
         Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows, Cols);
       return MB.CreateMatrixMultiply(Op0, Op1, 1, N, Cols, "hlsl.mul");
@@ -1142,6 +1143,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
       auto *MatTy = QTy0->castAs<ConstantMatrixType>();
       unsigned Rows = MatTy->getNumRows();
       unsigned Cols = MatTy->getNumColumns();
+      unsigned N = QTy1->castAs<VectorType>()->getNumElements();
+      assert(N == Cols && "vector length must match matrix column count");
       if (IsRowMajor)
         Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
       return MB.CreateMatrixMultiply(Op0, Op1, Rows, Cols, 1, "hlsl.mul");
@@ -1153,6 +1156,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     unsigned Rows1 = MatTy1->getNumRows();
     unsigned Cols0 = MatTy0->getNumColumns();
     unsigned Cols1 = MatTy1->getNumColumns();
+    assert(Cols0 == Rows1 &&
+           "inner matrix dimensions must match for multiplication");
     if (IsRowMajor) {
       Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows0, Cols0);
       Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows1, Cols1);

>From 36de9b1431137cc68dbb7fcc83fc349bb7dd6c8b Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 15:24:26 -0700
Subject: [PATCH 3/7] Eliminate redundant/cancelling transposes from matrix
 transpose codegen

---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 14 +++++-----
 .../test/CodeGenHLSL/builtins/transpose.hlsl  | 26 +++++--------------
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 30c3098285865..d13feef6c2eb0 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1174,17 +1174,15 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     unsigned Rows = MatTy->getNumRows();
     unsigned Cols = MatTy->getNumColumns();
     llvm::MatrixBuilder MB(Builder);
-    // The matrix transpose intrinsic only operates on column-major order
-    // matrices. Therefore matrix memory layout transforms must be inserted
-    // before and after matrix transpose intrinsics.
+    // The matrix transpose intrinsic operates on column-major matrices.
+    // For row-major, a row-major RxC matrix is equivalent to a column-major
+    // CxR matrix, so transposing with swapped dimensions produces the correct
+    // row-major CxR result directly.
     bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
                       LangOptions::MatrixMemoryLayout::MatrixRowMajor;
     if (IsRowMajor)
-      Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
-    Value *Result = MB.CreateMatrixTranspose(Op0, Rows, Cols);
-    if (IsRowMajor)
-      Result = MB.CreateColumnMajorToRowMajorTransform(Result, Cols, Rows);
-    return Result;
+      return MB.CreateMatrixTranspose(Op0, Cols, Rows);
+    return MB.CreateMatrixTranspose(Op0, Rows, Cols);
   }
   case Builtin::BI__builtin_hlsl_elementwise_rcp: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenHLSL/builtins/transpose.hlsl b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
index b0bb99c70fda2..d8430fcf5bf9d 100644
--- a/clang/test/CodeGenHLSL/builtins/transpose.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
@@ -10,9 +10,7 @@
 // CHECK:       store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
 // CHECK:       [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
 // COLMAJOR:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
-// ROWMAJOR:    [[LAYOUT:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
-// ROWMAJOR:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[LAYOUT]], i32 2, i32 3)
-// ROWMAJOR:    {{.*}} = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[TRANS]], i32 3, i32 2)
+// ROWMAJOR:    [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
 bool3x2 test_transpose_bool2x3(bool2x3 a) {
   return transpose(a);
 }
@@ -23,11 +21,8 @@ bool3x2 test_transpose_bool2x3(bool2x3 a) {
 // CHECK:       store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
 // CHECK:       [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
 // COLMAJOR:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
-// COLMAJOR:    ret <12 x i32> [[TRANS]]
-// ROWMAJOR:    [[LAYOUT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
-// ROWMAJOR:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[LAYOUT]], i32 4, i32 3)
-// ROWMAJOR:    [[RESULT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[TRANS]], i32 3, i32 4)
-// ROWMAJOR:    ret <12 x i32> [[RESULT]]
+// ROWMAJOR:    [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
+// CHECK:       ret <12 x i32> [[TRANS]]
 int3x4 test_transpose_int4x3(int4x3 a) {
   return transpose(a);
 }
@@ -36,12 +31,8 @@ int3x4 test_transpose_int4x3(int4x3 a) {
 // CHECK:       [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
 // CHECK:       store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
 // CHECK:       [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
-// COLMAJOR:    [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// COLMAJOR:    ret <16 x float> [[TRANS]]
-// ROWMAJOR:    [[LAYOUT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// ROWMAJOR:    [[TRANS:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[LAYOUT]], i32 4, i32 4)
-// ROWMAJOR:    [[RESULT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[TRANS]], i32 4, i32 4)
-// ROWMAJOR:    ret <16 x float> [[RESULT]]
+// CHECK:       [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// CHECK:       ret <16 x float> [[TRANS]]
 float4x4 test_transpose_float4x4(float4x4 a) {
   return transpose(a);
 }
@@ -52,11 +43,8 @@ float4x4 test_transpose_float4x4(float4x4 a) {
 // CHECK:       store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
 // CHECK:       [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
 // COLMAJOR:    [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
-// COLMAJOR:    ret <4 x double> [[TRANS]]
-// ROWMAJOR:    [[LAYOUT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
-// ROWMAJOR:    [[TRANS:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[LAYOUT]], i32 1, i32 4)
-// ROWMAJOR:    [[RESULT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[TRANS]], i32 4, i32 1)
-// ROWMAJOR:    ret <4 x double> [[RESULT]]
+// ROWMAJOR:    [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
+// CHECK:       ret <4 x double> [[TRANS]]
 double4x1 test_transpose_double1x4(double1x4 a) {
   return transpose(a);
 }

>From 699c32b4ef9bc8d027ea91cf76087906e44e5f91 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Wed, 18 Mar 2026 11:09:45 -0700
Subject: [PATCH 4/7] Add comment about matrix transpose memory layout
 assumption

---
 llvm/include/llvm/IR/MatrixBuilder.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index b161e79a1cb6b..73ea2daea966a 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -143,6 +143,9 @@ class MatrixBuilder {
 
   /// Create a column-major matrix from a row-major matrix with the given
   /// logical dimensions by transposing it.
+  /// Assumes the matrix transpose assumes column-major matrix memory layout,
+  /// which is true in the case of the DirectX and SPIRV backends, but not true
+  /// in the case of the LowerMatrixIntrinsics pass.
   CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
                                                  unsigned Columns,
                                                  const Twine &Name = "") {
@@ -151,6 +154,9 @@ class MatrixBuilder {
 
   /// Create a row-major matrix from a column-major matrix with the given
   /// logical dimensions by transposing it.
+  /// Assumes the matrix transpose assumes column-major matrix memory layout,
+  /// which is true in the case of the DirectX and SPIRV backends, but not true
+  /// in the case of the LowerMatrixIntrinsics pass.
   CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
                                                  unsigned Columns,
                                                  const Twine &Name = "") {

>From 06eb1c3dd6982fe5fc724cf4b3033ae48cdf58bb Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Wed, 18 Mar 2026 11:14:49 -0700
Subject: [PATCH 5/7] Add 'necessarily' keyword to transpose assumption

---
 llvm/include/llvm/IR/MatrixBuilder.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index 73ea2daea966a..41cd5ea0efd93 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -144,8 +144,8 @@ class MatrixBuilder {
   /// Create a column-major matrix from a row-major matrix with the given
   /// logical dimensions by transposing it.
   /// Assumes the matrix transpose assumes column-major matrix memory layout,
-  /// which is true in the case of the DirectX and SPIRV backends, but not true
-  /// in the case of the LowerMatrixIntrinsics pass.
+  /// which is true in the case of the DirectX and SPIRV backends, but not
+  /// necessarily true in the case of the LowerMatrixIntrinsics pass.
   CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
                                                  unsigned Columns,
                                                  const Twine &Name = "") {
@@ -155,8 +155,8 @@ class MatrixBuilder {
   /// Create a row-major matrix from a column-major matrix with the given
   /// logical dimensions by transposing it.
   /// Assumes the matrix transpose assumes column-major matrix memory layout,
-  /// which is true in the case of the DirectX and SPIRV backends, but not true
-  /// in the case of the LowerMatrixIntrinsics pass.
+  /// which is true in the case of the DirectX and SPIRV backends, but not
+  /// necessarily true in the case of the LowerMatrixIntrinsics pass.
   CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
                                                  unsigned Columns,
                                                  const Twine &Name = "") {

>From 03eb06aa0eb50e9c8981ff1aa811cf800c5a0c96 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Fri, 20 Mar 2026 12:35:26 -0700
Subject: [PATCH 6/7] Capture more variables in FileCheck

---
 clang/test/CodeGenHLSL/builtins/mul.hlsl | 45 ++++++++++++++----------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index f9151225d3846..408355054200d 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -67,8 +67,8 @@ export uint test_vec_vec_mulu(uint3 a, uint3 b) { return mul(a, b); }
 // CHECK-LABEL: test_vec_vec_muld
 // CHECK-NOT: @llvm.dx.fdot
 // CHECK-NOT: @llvm.spv.fdot
-// CHECK: fmul {{.*}} double
-// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+// CHECK: %[[FMUL:.*]] = fmul {{.*}} double
+// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %[[FMUL]])
 // CHECK: %hlsl.fmad.i.1 = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %hlsl.fmad.i)
 // CHECK: ret double %hlsl.fmad.i.1
 export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
@@ -76,8 +76,9 @@ export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
 // -- Case 6: vector * matrix -> vector --
 
 // CHECK-LABEL: test_vec_mat_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %{{.*}}, i32 1, i32 2, i32 3)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %[[TRANSPOSE]], i32 1, i32 2, i32 3)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3)
 // CHECK: ret <3 x float> %hlsl.mul
 export float3 test_vec_mat_mul(float2 v, float2x3 m) { return mul(v, m); }
 
@@ -93,17 +94,19 @@ export float2x3 test_mat_scalar_mul(float2x3 a, float b) { return mul(a, b); }
 // -- Case 8: matrix * vector -> vector --
 
 // CHECK-LABEL: test_mat_vec_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %{{.*}}, <3 x float> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %[[TRANSPOSE]], <3 x float> %v, i32 2, i32 3, i32 1)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1)
 // CHECK: ret <2 x float> %hlsl.mul
 export float2 test_mat_vec_mul(float2x3 m, float3 v) { return mul(m, v); }
 
 // -- Case 9: matrix * matrix -> matrix --
 
 // CHECK-LABEL: test_mat_mat_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %{{.*}}, <12 x float> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %[[TRANSPOSE_A]], <12 x float> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %a, <12 x float> %b, i32 2, i32 3, i32 4)
 // COLMAJOR: ret <8 x float> %hlsl.mul
 // ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %hlsl.mul, i32 2, i32 4)
 // ROWMAJOR: ret <8 x float> %[[TRANSPOSE_RES]]
@@ -112,9 +115,10 @@ export float2x4 test_mat_mat_mul(float2x3 a, float3x4 b) { return mul(a, b); }
 // -- Integer matrix multiply --
 
 // CHECK-LABEL: test_mat_mat_muli
-// ROWMAJOR: {{.*}} = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %{{.*}}, <12 x i32> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %[[TRANSPOSE_A]], <12 x i32> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %a, <12 x i32> %b, i32 2, i32 3, i32 4)
 // COLMAJOR: ret <8 x i32> %hlsl.mul
 // ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %hlsl.mul, i32 2, i32 4)
 // ROWMAJOR: ret <8 x i32> %[[TRANSPOSE_RES]]
@@ -162,21 +166,24 @@ export half test_vec_vec_mulh(half3 a, half3 b) { return mul(a, b); }
 export half2x3 test_mat_scalar_mulh(half2x3 a, half b) { return mul(a, b); }
 
 // CHECK-LABEL: test_vec_mat_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %{{.*}}, i32 1, i32 2, i32 3)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %[[TRANSPOSE]], i32 1, i32 2, i32 3)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %m, i32 1, i32 2, i32 3)
 // CHECK: ret <3 x half> %hlsl.mul
 export half3 test_vec_mat_mulh(half2 v, half2x3 m) { return mul(v, m); }
 
 // CHECK-LABEL: test_mat_vec_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %{{.*}}, <3 x half> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %[[TRANSPOSE]], <3 x half> %v, i32 2, i32 3, i32 1)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1)
 // CHECK: ret <2 x half> %hlsl.mul
 export half2 test_mat_vec_mulh(half2x3 m, half3 v) { return mul(m, v); }
 
 // CHECK-LABEL: test_mat_mat_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %{{.*}}, <12 x half> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %[[TRANSPOSE_A]], <12 x half> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %a, <12 x half> %b, i32 2, i32 3, i32 4)
 // COLMAJOR: ret <8 x half> %hlsl.mul
 // ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x half> @llvm.matrix.transpose.v8f16(<8 x half> %hlsl.mul, i32 2, i32 4)
 // ROWMAJOR: ret <8 x half> %[[TRANSPOSE_RES]]

>From fa7a7e48daf3656d15017a6b316997d03ae1b57d Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Fri, 20 Mar 2026 13:05:54 -0700
Subject: [PATCH 7/7] Do not capture fmul because it may be vectorized
 depending on build config

---
 clang/test/CodeGenHLSL/builtins/mul.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index 408355054200d..5e7468763654b 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -67,8 +67,8 @@ export uint test_vec_vec_mulu(uint3 a, uint3 b) { return mul(a, b); }
 // CHECK-LABEL: test_vec_vec_muld
 // CHECK-NOT: @llvm.dx.fdot
 // CHECK-NOT: @llvm.spv.fdot
-// CHECK: %[[FMUL:.*]] = fmul {{.*}} double
-// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %[[FMUL]])
+// CHECK: fmul {{.*}} double
+// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
 // CHECK: %hlsl.fmad.i.1 = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %hlsl.fmad.i)
 // CHECK: ret double %hlsl.fmad.i.1
 export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }