[clang] [llvm] [HLSL][DirectX] Add `transpose` HLSL intrinsic and DXIL lowering of `llvm.matrix.transpose` (PR #186263)

Fri Mar 13 10:31:05 PDT 2026

================
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s
+
+; Verify that llvm.matrix.transpose is expanded to shufflevector for DXIL.
+
+declare <6 x float> @llvm.matrix.transpose.v6f32(<6 x float>, i32, i32)
+declare <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32>, i32, i32)
+declare <16 x float> @llvm.matrix.transpose.v16f32(<16 x float>, i32, i32)
+declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32)
+declare <4 x half> @llvm.matrix.transpose.v4f16(<4 x half>, i32, i32)
+
+; 2x3 float -> 3x2 float
+define <6 x float> @test_transpose_float2x3(<6 x float> %m) {
+; CHECK-LABEL: define <6 x float> @test_transpose_float2x3(
+; CHECK-SAME: <6 x float> [[M:%.*]]) {
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x float> [[M]], <6 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:    ret <6 x float> [[TMP12]]
+;
+  %r = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %m, i32 2, i32 3)
+  ret <6 x float> %r
+}
+
+; 4x3 int -> 3x4 int
+define <12 x i32> @test_transpose_int4x3(<12 x i32> %m) {
+; CHECK-LABEL: define <12 x i32> @test_transpose_int4x3(
+; CHECK-SAME: <12 x i32> [[M:%.*]]) {
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <12 x i32> [[M]], <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:    ret <12 x i32> [[TMP24]]
+;
+  %r = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %m, i32 4, i32 3)
+  ret <12 x i32> %r
+}
+
+; 4x4 float -> 4x4 float
+define <16 x float> @test_transpose_float4x4(<16 x float> %m) {
+; CHECK-LABEL: define <16 x float> @test_transpose_float4x4(
+; CHECK-SAME: <16 x float> [[M:%.*]]) {
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x float> [[M]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
----------------
farzonl wrote:

Do we know how the scalarizer handles shuffle vectors this large? I think it is probably fine.  We should just make sure  it is well tested if not by our backend other ones.

https://github.com/llvm/llvm-project/pull/186263