[clang] [llvm] [HLSL][Matrix] Support row-major `transpose` and `mul` by inserting matrix memory layout transformations (PR #186898)
Deric C. via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 13:06:24 PDT 2026
https://github.com/Icohedron updated https://github.com/llvm/llvm-project/pull/186898
>From 03162e8a21e0fa21a2a518c7fbda0c4d2b1b91a6 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 14:16:31 -0700
Subject: [PATCH 1/7] Insert matrix memory layout transposes before and after
matrix intrinsics
The SPIRV and DXIL backends assume matrices are provided in column-major
order when lowering matrix transpose and matrix multiplication
intrinsics.
To support row-major order matrices from Clang/HLSL, we therefore need
to convert row-major order matrices into column-major order matrices
before applying matrix transpose and multiplication. A conversion
from column-major order back to row-major order is also required for
correctness after a matrix transpose or matrix multiply.
This commit adds helper functions to the MatrixBuilder to convert an NxM
row-/column-major order matrix into an NxM column-/row-major order matrix
by applying a matrix transpose.
These matrix memory layout transposes are inserted before and after HLSL
transpose and mul when lowering.
Assisted-by: GitHub Copilot (powered by Claude Opus 4.6)
---
clang/lib/CodeGen/CGHLSLBuiltins.cpp | 46 +++++++++++--
clang/test/CodeGenHLSL/builtins/mul.hlsl | 42 ++++++++----
.../test/CodeGenHLSL/builtins/transpose.hlsl | 64 ++++++++++++-------
llvm/include/llvm/IR/MatrixBuilder.h | 16 +++++
4 files changed, 127 insertions(+), 41 deletions(-)
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 80c590437309d..a891864e6d964 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1122,32 +1122,64 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
bool IsMat0 = QTy0->isConstantMatrixType();
bool IsMat1 = QTy1->isConstantMatrixType();
+ // The matrix multiply intrinsic only operates on column-major order
+ // matrices. Therefore matrix memory layout transforms must be inserted
+ // before and after matrix multiply intrinsics.
+ bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
+ LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+
llvm::MatrixBuilder MB(Builder);
if (IsVec0 && IsMat1) {
unsigned N = QTy0->castAs<VectorType>()->getNumElements();
auto *MatTy = QTy1->castAs<ConstantMatrixType>();
- unsigned M = MatTy->getNumColumns();
- return MB.CreateMatrixMultiply(Op0, Op1, 1, N, M, "hlsl.mul");
+ unsigned Rows = MatTy->getNumRows();
+ unsigned Cols = MatTy->getNumColumns();
+ if (IsRowMajor)
+ Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows, Cols);
+ return MB.CreateMatrixMultiply(Op0, Op1, 1, N, Cols, "hlsl.mul");
}
if (IsMat0 && IsVec1) {
auto *MatTy = QTy0->castAs<ConstantMatrixType>();
unsigned Rows = MatTy->getNumRows();
unsigned Cols = MatTy->getNumColumns();
+ if (IsRowMajor)
+ Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
return MB.CreateMatrixMultiply(Op0, Op1, Rows, Cols, 1, "hlsl.mul");
}
assert(IsMat0 && IsMat1);
auto *MatTy0 = QTy0->castAs<ConstantMatrixType>();
auto *MatTy1 = QTy1->castAs<ConstantMatrixType>();
- return MB.CreateMatrixMultiply(Op0, Op1, MatTy0->getNumRows(),
- MatTy0->getNumColumns(),
- MatTy1->getNumColumns(), "hlsl.mul");
+ unsigned Rows0 = MatTy0->getNumRows();
+ unsigned Rows1 = MatTy1->getNumRows();
+ unsigned Cols0 = MatTy0->getNumColumns();
+ unsigned Cols1 = MatTy1->getNumColumns();
+ if (IsRowMajor) {
+ Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows0, Cols0);
+ Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows1, Cols1);
+ }
+ Value *Result =
+ MB.CreateMatrixMultiply(Op0, Op1, Rows0, Cols0, Cols1, "hlsl.mul");
+ if (IsRowMajor)
+ Result = MB.CreateColumnMajorToRowMajorTransform(Result, Rows0, Cols1);
+ return Result;
}
case Builtin::BI__builtin_hlsl_transpose: {
Value *Op0 = EmitScalarExpr(E->getArg(0));
auto *MatTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
+ unsigned Rows = MatTy->getNumRows();
+ unsigned Cols = MatTy->getNumColumns();
llvm::MatrixBuilder MB(Builder);
- return MB.CreateMatrixTranspose(Op0, MatTy->getNumRows(),
- MatTy->getNumColumns());
+ // The matrix transpose intrinsic only operates on column-major order
+ // matrices. Therefore matrix memory layout transforms must be inserted
+ // before and after matrix transpose intrinsics.
+ bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
+ LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+ if (IsRowMajor)
+ Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
+ Value *Result = MB.CreateMatrixTranspose(Op0, Rows, Cols);
+ if (IsRowMajor)
+ Result = MB.CreateColumnMajorToRowMajorTransform(Result, Cols, Rows);
+ return Result;
}
case Builtin::BI__builtin_hlsl_elementwise_rcp: {
Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index 17749e527af65..f9151225d3846 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,DXIL
-// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,SPIRV
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,DXIL
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,SPIRV
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,DXIL
+// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,SPIRV
// -- Case 1: scalar * scalar -> scalar --
@@ -74,7 +76,8 @@ export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
// -- Case 6: vector * matrix -> vector --
// CHECK-LABEL: test_vec_mat_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %{{.*}}, i32 1, i32 2, i32 3)
// CHECK: ret <3 x float> %hlsl.mul
export float3 test_vec_mat_mul(float2 v, float2x3 m) { return mul(v, m); }
@@ -90,22 +93,31 @@ export float2x3 test_mat_scalar_mul(float2x3 a, float b) { return mul(a, b); }
// -- Case 8: matrix * vector -> vector --
// CHECK-LABEL: test_mat_vec_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %{{.*}}, <3 x float> %v, i32 2, i32 3, i32 1)
// CHECK: ret <2 x float> %hlsl.mul
export float2 test_mat_vec_mul(float2x3 m, float3 v) { return mul(m, v); }
// -- Case 9: matrix * matrix -> matrix --
// CHECK-LABEL: test_mat_mat_mul
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %a, <12 x float> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x float> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %{{.*}}, <12 x float> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x float> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x float> %[[TRANSPOSE_RES]]
export float2x4 test_mat_mat_mul(float2x3 a, float3x4 b) { return mul(a, b); }
// -- Integer matrix multiply --
// CHECK-LABEL: test_mat_mat_muli
-// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %a, <12 x i32> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x i32> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %{{.*}}, <12 x i32> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x i32> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x i32> %[[TRANSPOSE_RES]]
export int2x4 test_mat_mat_muli(int2x3 a, int3x4 b) { return mul(a, b); }
// -- Half-type overloads (native half) --
@@ -150,16 +162,22 @@ export half test_vec_vec_mulh(half3 a, half3 b) { return mul(a, b); }
export half2x3 test_mat_scalar_mulh(half2x3 a, half b) { return mul(a, b); }
// CHECK-LABEL: test_vec_mat_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %m, i32 1, i32 2, i32 3)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %{{.*}}, i32 1, i32 2, i32 3)
// CHECK: ret <3 x half> %hlsl.mul
export half3 test_vec_mat_mulh(half2 v, half2x3 m) { return mul(v, m); }
// CHECK-LABEL: test_mat_vec_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %{{.*}}, <3 x half> %v, i32 2, i32 3, i32 1)
// CHECK: ret <2 x half> %hlsl.mul
export half2 test_mat_vec_mulh(half2x3 m, half3 v) { return mul(m, v); }
// CHECK-LABEL: test_mat_mat_mulh
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %a, <12 x half> %b, i32 2, i32 3, i32 4)
-// CHECK: ret <8 x half> %hlsl.mul
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
+// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %{{.*}}, i32 4, i32 3)
+// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %{{.*}}, <12 x half> %{{.*}}, i32 2, i32 3, i32 4)
+// COLMAJOR: ret <8 x half> %hlsl.mul
+// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x half> @llvm.matrix.transpose.v8f16(<8 x half> %hlsl.mul, i32 2, i32 4)
+// ROWMAJOR: ret <8 x half> %[[TRANSPOSE_RES]]
export half2x4 test_mat_mat_mulh(half2x3 a, half3x4 b) { return mul(a, b); }
diff --git a/clang/test/CodeGenHLSL/builtins/transpose.hlsl b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
index 9018e3913269d..b0bb99c70fda2 100644
--- a/clang/test/CodeGenHLSL/builtins/transpose.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
@@ -1,42 +1,62 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR
// CHECK-LABEL: define {{.*}}test_transpose_bool2x3
-// CHECK: [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4
-// CHECK: [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32>
-// CHECK: store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
-// CHECK: [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
-// CHECK: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
+// COLMAJOR: [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4
+// ROWMAJOR: [[A_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
+// CHECK: [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32>
+// CHECK: store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
+// CHECK: [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
+// COLMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
+// ROWMAJOR: [[LAYOUT:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
+// ROWMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[LAYOUT]], i32 2, i32 3)
+// ROWMAJOR: {{.*}} = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[TRANS]], i32 3, i32 2)
bool3x2 test_transpose_bool2x3(bool2x3 a) {
return transpose(a);
}
// CHECK-LABEL: define {{.*}}test_transpose_int4x3
-// CHECK: [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4
-// CHECK: store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
-// CHECK: [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
-// CHECK: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
-// CHECK: ret <12 x i32> [[TRANS]]
+// COLMAJOR: [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4
+// ROWMAJOR: [[A_ADDR:%.*]] = alloca [4 x <3 x i32>], align 4
+// CHECK: store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
+// CHECK: [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
+// COLMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
+// COLMAJOR: ret <12 x i32> [[TRANS]]
+// ROWMAJOR: [[LAYOUT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
+// ROWMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[LAYOUT]], i32 4, i32 3)
+// ROWMAJOR: [[RESULT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[TRANS]], i32 3, i32 4)
+// ROWMAJOR: ret <12 x i32> [[RESULT]]
int3x4 test_transpose_int4x3(int4x3 a) {
return transpose(a);
}
// CHECK-LABEL: define {{.*}}test_transpose_float4x4
-// CHECK: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
-// CHECK: store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
-// CHECK: [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
-// CHECK: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// CHECK: ret <16 x float> [[TRANS]]
+// CHECK: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
+// CHECK: store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
+// CHECK: [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
+// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// COLMAJOR: ret <16 x float> [[TRANS]]
+// ROWMAJOR: [[LAYOUT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[LAYOUT]], i32 4, i32 4)
+// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[TRANS]], i32 4, i32 4)
+// ROWMAJOR: ret <16 x float> [[RESULT]]
float4x4 test_transpose_float4x4(float4x4 a) {
return transpose(a);
}
// CHECK-LABEL: define {{.*}}test_transpose_double1x4
-// CHECK: [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8
-// CHECK: store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
-// CHECK: [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
-// CHECK: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
-// CHECK: ret <4 x double> [[TRANS]]
+// COLMAJOR: [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8
+// ROWMAJOR: [[A_ADDR:%.*]] = alloca [1 x <4 x double>], align 8
+// CHECK: store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
+// CHECK: [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
+// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
+// COLMAJOR: ret <4 x double> [[TRANS]]
+// ROWMAJOR: [[LAYOUT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
+// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[LAYOUT]], i32 1, i32 4)
+// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[TRANS]], i32 4, i32 1)
+// ROWMAJOR: ret <4 x double> [[RESULT]]
double4x1 test_transpose_double1x4(double1x4 a) {
return transpose(a);
}
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index 5c891d644bfd2..b161e79a1cb6b 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -141,6 +141,22 @@ class MatrixBuilder {
return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
}
+ /// Create a column-major matrix from a row-major matrix with the given
+ /// logical dimensions by transposing it.
+ CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
+ unsigned Columns,
+ const Twine &Name = "") {
+ return CreateMatrixTranspose(Matrix, Columns, Rows, Name);
+ }
+
+ /// Create a row-major matrix from a column-major matrix with the given
+ /// logical dimensions by transposing it.
+ CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
+ unsigned Columns,
+ const Twine &Name = "") {
+ return CreateMatrixTranspose(Matrix, Rows, Columns, Name);
+ }
+
/// Insert a single element \p NewVal into \p Matrix at indices (\p RowIdx, \p
/// ColumnIdx).
Value *CreateMatrixInsert(Value *Matrix, Value *NewVal, Value *RowIdx,
>From b5f29e0a645407234b75f73d1d7722640214c103 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 15:09:39 -0700
Subject: [PATCH 2/7] Add assertions for mul codegen to ensure dimensions are
correct
---
clang/lib/CodeGen/CGHLSLBuiltins.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index a891864e6d964..30c3098285865 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1134,6 +1134,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
auto *MatTy = QTy1->castAs<ConstantMatrixType>();
unsigned Rows = MatTy->getNumRows();
unsigned Cols = MatTy->getNumColumns();
+ assert(N == Rows && "vector length must match matrix row count");
if (IsRowMajor)
Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows, Cols);
return MB.CreateMatrixMultiply(Op0, Op1, 1, N, Cols, "hlsl.mul");
@@ -1142,6 +1143,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
auto *MatTy = QTy0->castAs<ConstantMatrixType>();
unsigned Rows = MatTy->getNumRows();
unsigned Cols = MatTy->getNumColumns();
+ unsigned N = QTy1->castAs<VectorType>()->getNumElements();
+ assert(N == Cols && "vector length must match matrix column count");
if (IsRowMajor)
Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
return MB.CreateMatrixMultiply(Op0, Op1, Rows, Cols, 1, "hlsl.mul");
@@ -1153,6 +1156,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
unsigned Rows1 = MatTy1->getNumRows();
unsigned Cols0 = MatTy0->getNumColumns();
unsigned Cols1 = MatTy1->getNumColumns();
+ assert(Cols0 == Rows1 &&
+ "inner matrix dimensions must match for multiplication");
if (IsRowMajor) {
Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows0, Cols0);
Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows1, Cols1);
>From 36de9b1431137cc68dbb7fcc83fc349bb7dd6c8b Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Mon, 16 Mar 2026 15:24:26 -0700
Subject: [PATCH 3/7] Eliminate redundant/cancelling transposes from matrix
transpose codegen
---
clang/lib/CodeGen/CGHLSLBuiltins.cpp | 14 +++++-----
.../test/CodeGenHLSL/builtins/transpose.hlsl | 26 +++++--------------
2 files changed, 13 insertions(+), 27 deletions(-)
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 30c3098285865..d13feef6c2eb0 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1174,17 +1174,15 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
unsigned Rows = MatTy->getNumRows();
unsigned Cols = MatTy->getNumColumns();
llvm::MatrixBuilder MB(Builder);
- // The matrix transpose intrinsic only operates on column-major order
- // matrices. Therefore matrix memory layout transforms must be inserted
- // before and after matrix transpose intrinsics.
+ // The matrix transpose intrinsic operates on column-major matrices.
+ // For row-major, a row-major RxC matrix is equivalent to a column-major
+ // CxR matrix, so transposing with swapped dimensions produces the correct
+ // row-major CxR result directly.
bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() ==
LangOptions::MatrixMemoryLayout::MatrixRowMajor;
if (IsRowMajor)
- Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols);
- Value *Result = MB.CreateMatrixTranspose(Op0, Rows, Cols);
- if (IsRowMajor)
- Result = MB.CreateColumnMajorToRowMajorTransform(Result, Cols, Rows);
- return Result;
+ return MB.CreateMatrixTranspose(Op0, Cols, Rows);
+ return MB.CreateMatrixTranspose(Op0, Rows, Cols);
}
case Builtin::BI__builtin_hlsl_elementwise_rcp: {
Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenHLSL/builtins/transpose.hlsl b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
index b0bb99c70fda2..d8430fcf5bf9d 100644
--- a/clang/test/CodeGenHLSL/builtins/transpose.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/transpose.hlsl
@@ -10,9 +10,7 @@
// CHECK: store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4
// CHECK: [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4
// COLMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3)
-// ROWMAJOR: [[LAYOUT:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
-// ROWMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[LAYOUT]], i32 2, i32 3)
-// ROWMAJOR: {{.*}} = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[TRANS]], i32 3, i32 2)
+// ROWMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2)
bool3x2 test_transpose_bool2x3(bool2x3 a) {
return transpose(a);
}
@@ -23,11 +21,8 @@ bool3x2 test_transpose_bool2x3(bool2x3 a) {
// CHECK: store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4
// CHECK: [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4
// COLMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3)
-// COLMAJOR: ret <12 x i32> [[TRANS]]
-// ROWMAJOR: [[LAYOUT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
-// ROWMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[LAYOUT]], i32 4, i32 3)
-// ROWMAJOR: [[RESULT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[TRANS]], i32 3, i32 4)
-// ROWMAJOR: ret <12 x i32> [[RESULT]]
+// ROWMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4)
+// CHECK: ret <12 x i32> [[TRANS]]
int3x4 test_transpose_int4x3(int4x3 a) {
return transpose(a);
}
@@ -36,12 +31,8 @@ int3x4 test_transpose_int4x3(int4x3 a) {
// CHECK: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK: store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4
// CHECK: [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
-// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// COLMAJOR: ret <16 x float> [[TRANS]]
-// ROWMAJOR: [[LAYOUT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
-// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[LAYOUT]], i32 4, i32 4)
-// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[TRANS]], i32 4, i32 4)
-// ROWMAJOR: ret <16 x float> [[RESULT]]
+// CHECK: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4)
+// CHECK: ret <16 x float> [[TRANS]]
float4x4 test_transpose_float4x4(float4x4 a) {
return transpose(a);
}
@@ -52,11 +43,8 @@ float4x4 test_transpose_float4x4(float4x4 a) {
// CHECK: store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8
// CHECK: [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8
// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4)
-// COLMAJOR: ret <4 x double> [[TRANS]]
-// ROWMAJOR: [[LAYOUT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
-// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[LAYOUT]], i32 1, i32 4)
-// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[TRANS]], i32 4, i32 1)
-// ROWMAJOR: ret <4 x double> [[RESULT]]
+// ROWMAJOR: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1)
+// CHECK: ret <4 x double> [[TRANS]]
double4x1 test_transpose_double1x4(double1x4 a) {
return transpose(a);
}
>From 699c32b4ef9bc8d027ea91cf76087906e44e5f91 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Wed, 18 Mar 2026 11:09:45 -0700
Subject: [PATCH 4/7] Add comment about matrix transpose memory layout
assumption
---
llvm/include/llvm/IR/MatrixBuilder.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index b161e79a1cb6b..73ea2daea966a 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -143,6 +143,9 @@ class MatrixBuilder {
/// Create a column-major matrix from a row-major matrix with the given
/// logical dimensions by transposing it.
+ /// Assumes the matrix transpose assumes column-major matrix memory layout,
+ /// which is true in the case of the DirectX and SPIRV backends, but not true
+ /// in the case of the LowerMatrixIntrinsics pass.
CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
unsigned Columns,
const Twine &Name = "") {
@@ -151,6 +154,9 @@ class MatrixBuilder {
/// Create a row-major matrix from a column-major matrix with the given
/// logical dimensions by transposing it.
+ /// Assumes the matrix transpose assumes column-major matrix memory layout,
+ /// which is true in the case of the DirectX and SPIRV backends, but not true
+ /// in the case of the LowerMatrixIntrinsics pass.
CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
unsigned Columns,
const Twine &Name = "") {
>From 06eb1c3dd6982fe5fc724cf4b3033ae48cdf58bb Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Wed, 18 Mar 2026 11:14:49 -0700
Subject: [PATCH 5/7] Add 'necessarily' key word to transpose assumption
---
llvm/include/llvm/IR/MatrixBuilder.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index 73ea2daea966a..41cd5ea0efd93 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -144,8 +144,8 @@ class MatrixBuilder {
/// Create a column-major matrix from a row-major matrix with the given
/// logical dimensions by transposing it.
/// Assumes the matrix transpose assumes column-major matrix memory layout,
- /// which is true in the case of the DirectX and SPIRV backends, but not true
- /// in the case of the LowerMatrixIntrinsics pass.
+ /// which is true in the case of the DirectX and SPIRV backends, but not
+ /// necessarily true in the case of the LowerMatrixIntrinsics pass.
CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows,
unsigned Columns,
const Twine &Name = "") {
@@ -155,8 +155,8 @@ class MatrixBuilder {
/// Create a row-major matrix from a column-major matrix with the given
/// logical dimensions by transposing it.
/// Assumes the matrix transpose assumes column-major matrix memory layout,
- /// which is true in the case of the DirectX and SPIRV backends, but not true
- /// in the case of the LowerMatrixIntrinsics pass.
+ /// which is true in the case of the DirectX and SPIRV backends, but not
+ /// necessarily true in the case of the LowerMatrixIntrinsics pass.
CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows,
unsigned Columns,
const Twine &Name = "") {
>From 03eb06aa0eb50e9c8981ff1aa811cf800c5a0c96 Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Fri, 20 Mar 2026 12:35:26 -0700
Subject: [PATCH 6/7] Capture more variables in FileCheck
---
clang/test/CodeGenHLSL/builtins/mul.hlsl | 45 ++++++++++++++----------
1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index f9151225d3846..408355054200d 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -67,8 +67,8 @@ export uint test_vec_vec_mulu(uint3 a, uint3 b) { return mul(a, b); }
// CHECK-LABEL: test_vec_vec_muld
// CHECK-NOT: @llvm.dx.fdot
// CHECK-NOT: @llvm.spv.fdot
-// CHECK: fmul {{.*}} double
-// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+// CHECK: %[[FMUL:.*]] = fmul {{.*}} double
+// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %[[FMUL]])
// CHECK: %hlsl.fmad.i.1 = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %hlsl.fmad.i)
// CHECK: ret double %hlsl.fmad.i.1
export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
@@ -76,8 +76,9 @@ export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
// -- Case 6: vector * matrix -> vector --
// CHECK-LABEL: test_vec_mat_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %{{.*}}, i32 1, i32 2, i32 3)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %[[TRANSPOSE]], i32 1, i32 2, i32 3)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3)
// CHECK: ret <3 x float> %hlsl.mul
export float3 test_vec_mat_mul(float2 v, float2x3 m) { return mul(v, m); }
@@ -93,17 +94,19 @@ export float2x3 test_mat_scalar_mul(float2x3 a, float b) { return mul(a, b); }
// -- Case 8: matrix * vector -> vector --
// CHECK-LABEL: test_mat_vec_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %{{.*}}, <3 x float> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %[[TRANSPOSE]], <3 x float> %v, i32 2, i32 3, i32 1)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1)
// CHECK: ret <2 x float> %hlsl.mul
export float2 test_mat_vec_mul(float2x3 m, float3 v) { return mul(m, v); }
// -- Case 9: matrix * matrix -> matrix --
// CHECK-LABEL: test_mat_mat_mul
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %{{.*}}, <12 x float> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %[[TRANSPOSE_A]], <12 x float> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %a, <12 x float> %b, i32 2, i32 3, i32 4)
// COLMAJOR: ret <8 x float> %hlsl.mul
// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %hlsl.mul, i32 2, i32 4)
// ROWMAJOR: ret <8 x float> %[[TRANSPOSE_RES]]
@@ -112,9 +115,10 @@ export float2x4 test_mat_mat_mul(float2x3 a, float3x4 b) { return mul(a, b); }
// -- Integer matrix multiply --
// CHECK-LABEL: test_mat_mat_muli
-// ROWMAJOR: {{.*}} = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %{{.*}}, <12 x i32> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %[[TRANSPOSE_A]], <12 x i32> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %a, <12 x i32> %b, i32 2, i32 3, i32 4)
// COLMAJOR: ret <8 x i32> %hlsl.mul
// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %hlsl.mul, i32 2, i32 4)
// ROWMAJOR: ret <8 x i32> %[[TRANSPOSE_RES]]
@@ -162,21 +166,24 @@ export half test_vec_vec_mulh(half3 a, half3 b) { return mul(a, b); }
export half2x3 test_mat_scalar_mulh(half2x3 a, half b) { return mul(a, b); }
// CHECK-LABEL: test_vec_mat_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %{{.*}}, i32 1, i32 2, i32 3)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %[[TRANSPOSE]], i32 1, i32 2, i32 3)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %m, i32 1, i32 2, i32 3)
// CHECK: ret <3 x half> %hlsl.mul
export half3 test_vec_mat_mulh(half2 v, half2x3 m) { return mul(v, m); }
// CHECK-LABEL: test_mat_vec_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %{{.*}}, <3 x half> %v, i32 2, i32 3, i32 1)
+// ROWMAJOR: %[[TRANSPOSE:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %m, i32 3, i32 2)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %[[TRANSPOSE]], <3 x half> %v, i32 2, i32 3, i32 1)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1)
// CHECK: ret <2 x half> %hlsl.mul
export half2 test_mat_vec_mulh(half2x3 m, half3 v) { return mul(m, v); }
// CHECK-LABEL: test_mat_mat_mulh
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2)
-// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %{{.*}}, i32 4, i32 3)
-// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %{{.*}}, <12 x half> %{{.*}}, i32 2, i32 3, i32 4)
+// ROWMAJOR: %[[TRANSPOSE_A:.*]] = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %a, i32 3, i32 2)
+// ROWMAJOR: %[[TRANSPOSE_B:.*]] = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %b, i32 4, i32 3)
+// ROWMAJOR: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %[[TRANSPOSE_A]], <12 x half> %[[TRANSPOSE_B]], i32 2, i32 3, i32 4)
+// COLMAJOR: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %a, <12 x half> %b, i32 2, i32 3, i32 4)
// COLMAJOR: ret <8 x half> %hlsl.mul
// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x half> @llvm.matrix.transpose.v8f16(<8 x half> %hlsl.mul, i32 2, i32 4)
// ROWMAJOR: ret <8 x half> %[[TRANSPOSE_RES]]
>From fa7a7e48daf3656d15017a6b316997d03ae1b57d Mon Sep 17 00:00:00 2001
From: Deric Cheung <cheung.deric at gmail.com>
Date: Fri, 20 Mar 2026 13:05:54 -0700
Subject: [PATCH 7/7] Do not capture fmul because it may be vectorized
depending on build config
---
clang/test/CodeGenHLSL/builtins/mul.hlsl | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl
index 408355054200d..5e7468763654b 100644
--- a/clang/test/CodeGenHLSL/builtins/mul.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl
@@ -67,8 +67,8 @@ export uint test_vec_vec_mulu(uint3 a, uint3 b) { return mul(a, b); }
// CHECK-LABEL: test_vec_vec_muld
// CHECK-NOT: @llvm.dx.fdot
// CHECK-NOT: @llvm.spv.fdot
-// CHECK: %[[FMUL:.*]] = fmul {{.*}} double
-// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %[[FMUL]])
+// CHECK: fmul {{.*}} double
+// CHECK: %hlsl.fmad.i = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
// CHECK: %hlsl.fmad.i.1 = {{.*}}call {{.*}} double @llvm.fmuladd.f64(double %{{.*}}, double %{{.*}}, double %hlsl.fmad.i)
// CHECK: ret double %hlsl.fmad.i.1
export double test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); }
More information about the cfe-commits mailing list