[clang] [HLSL][Matrix] Add support for Matrix element and trunc Casts (PR #168915)

Tue Dec 2 10:13:39 PST 2025

https://github.com/farzonl updated https://github.com/llvm/llvm-project/pull/168915

>From 49bc3915548e9755c5448d9ecff9b3128ae1c3ba Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Thu, 20 Nov 2025 12:16:26 -0500
Subject: [PATCH 01/12] [HLSL][Matrix] Add support for Matrix element and trunc
 Casts

fixes #168737
fixes #168755

This change fixes adds support for Matrix truncations
via the ICK_HLSL_Matrix_Truncation enum. That ends up being
most of the files changed.

It also allows Matrix as an HLSL Elementwise cast as long as the
cast does not perform a shape transformation ie  3x2 to 2x3.

Tests for the new elementwise and truncation behavior were added.
As well as sema tests to make sure we error n the shape transformation
cast.

I am punting right now on the ConstExpr Matrix support.
That will need to be addressed later. Will file a seperate issue for
that if reviewers agree it can wait.
---
 clang/include/clang/AST/OperationKinds.def    |   3 +
 clang/include/clang/Sema/Overload.h           |   3 +
 clang/lib/AST/Expr.cpp                        |   1 +
 clang/lib/AST/ExprConstant.cpp                |  13 ++
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |   2 +
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp   |   1 +
 clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp  |   1 +
 clang/lib/CodeGen/CGExpr.cpp                  |   1 +
 clang/lib/CodeGen/CGExprAgg.cpp               |   3 +-
 clang/lib/CodeGen/CGExprComplex.cpp           |   1 +
 clang/lib/CodeGen/CGExprConstant.cpp          |   1 +
 clang/lib/CodeGen/CGExprScalar.cpp            |  35 +++-
 clang/lib/Sema/SemaExprCXX.cpp                |  22 ++-
 clang/lib/Sema/SemaHLSL.cpp                   |   5 +-
 clang/lib/Sema/SemaOverload.cpp               |  75 ++++++-
 clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp |   1 +
 .../BasicFeatures/MatrixElementTypeCast.hlsl  | 186 ++++++++++++++++++
 .../BasicFeatures/MatrixTruncation.hlsl       | 156 +++++++++++++++
 .../Types/BuiltinMatrix/MatrixCastErrors.hlsl |  21 ++
 19 files changed, 516 insertions(+), 15 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixCastErrors.hlsl

diff --git a/clang/include/clang/AST/OperationKinds.def b/clang/include/clang/AST/OperationKinds.def
index c2dca895e8411..8a13ad988403b 100644
--- a/clang/include/clang/AST/OperationKinds.def
+++ b/clang/include/clang/AST/OperationKinds.def
@@ -364,6 +364,9 @@ CAST_OPERATION(IntToOCLSampler)
 // Truncate a vector type by dropping elements from the end (HLSL only).
 CAST_OPERATION(HLSLVectorTruncation)
 
+// Truncate a matrix type by dropping elements from the end (HLSL only).
+CAST_OPERATION(HLSLMatrixTruncation)
+
 // Non-decaying array RValue cast (HLSL only).
 CAST_OPERATION(HLSLArrayRValue)
 
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 59bbd0fbd9e95..1ad52cb9da517 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -198,6 +198,9 @@ class Sema;
     /// HLSL vector truncation.
     ICK_HLSL_Vector_Truncation,
 
+    /// HLSL Matrid truncation.
+    ICK_HLSL_Matrix_Truncation,
+
     /// HLSL non-decaying array rvalue cast.
     ICK_HLSL_Array_RValue,
 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 1f405920ce6b5..ca7f3e16a9276 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1934,6 +1934,7 @@ bool CastExpr::CastConsistency() const {
   case CK_FixedPointToBoolean:
   case CK_HLSLArrayRValue:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
   CheckNoBasePath:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index e5af4cb049ba9..baadaab0973a2 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11770,6 +11770,10 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
       Elements.push_back(Val.getVectorElt(I));
     return Success(Elements, E);
   }
+  case CK_HLSLMatrixTruncation: {
+    // TODO: support Expr Constant for Matrix Truncation
+    return Error(E);
+  }
   case CK_HLSLAggregateSplatCast: {
     APValue Val;
     QualType ValTy;
@@ -18430,6 +18434,10 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
       return Error(E);
     return Success(Val.getVectorElt(0), E);
   }
+  case CK_HLSLMatrixTruncation: {
+    // TODO: support Expr Constant for Matrix Truncation
+    return Error(E);
+  }
   case CK_HLSLElementwiseCast: {
     SmallVector<APValue> SrcVals;
     SmallVector<QualType> SrcTypes;
@@ -19023,6 +19031,10 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
       return Error(E);
     return Success(Val.getVectorElt(0), E);
   }
+  case CK_HLSLMatrixTruncation: {
+    // TODO: support Expr Constant for Matrix Truncation
+    return Error(E);
+  }
   case CK_HLSLElementwiseCast: {
     SmallVector<APValue> SrcVals;
     SmallVector<QualType> SrcTypes;
@@ -19180,6 +19192,7 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) {
   case CK_IntegralToFixedPoint:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
     llvm_unreachable("invalid cast kind for complex value");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 4065124f8f568..d48fb45aed78c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -188,6 +188,7 @@ Address CIRGenFunction::emitPointerWithAlignment(const Expr *expr,
     case CK_HLSLArrayRValue:
     case CK_HLSLElementwiseCast:
     case CK_HLSLVectorTruncation:
+    case CK_HLSLMatrixTruncation:
     case CK_IntToOCLSampler:
     case CK_IntegralCast:
     case CK_IntegralComplexCast:
@@ -1323,6 +1324,7 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
   case CK_IntegralToFixedPoint:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLArrayRValue:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 9ed920085c8c6..fe06f8cc2c430 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -534,6 +534,7 @@ mlir::Value ComplexExprEmitter::emitCast(CastKind ck, Expr *op,
   case CK_IntegralToFixedPoint:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLArrayRValue:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 66f8ef9b05913..329fd08bc8914 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -1012,6 +1012,7 @@ class ConstExprEmitter
     case CK_MatrixCast:
     case CK_HLSLArrayRValue:
     case CK_HLSLVectorTruncation:
+    case CK_HLSLMatrixTruncation:
     case CK_HLSLElementwiseCast:
     case CK_HLSLAggregateSplatCast:
       return {};
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index c8f669b69d991..a54d400c66968 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5772,6 +5772,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) {
   case CK_IntegralToFixedPoint:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLArrayRValue:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 67b5f919d1b2a..7cc4d6c8f06f6 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1036,7 +1036,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
   case CK_ZeroToOCLOpaqueType:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
-
+  case CK_HLSLMatrixTruncation:
   case CK_IntToOCLSampler:
   case CK_FloatingToFixedPoint:
   case CK_FixedPointToFloating:
@@ -1550,6 +1550,7 @@ static bool castPreservesZero(const CastExpr *CE) {
   case CK_NonAtomicToAtomic:
   case CK_AtomicToNonAtomic:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
     return true;
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index bca7c30557f03..e5815ef1130dc 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -621,6 +621,7 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
   case CK_IntegralToFixedPoint:
   case CK_MatrixCast:
   case CK_HLSLVectorTruncation:
+  case CK_HLSLMatrixTruncation:
   case CK_HLSLArrayRValue:
   case CK_HLSLElementwiseCast:
   case CK_HLSLAggregateSplatCast:
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 6407afc3d9447..0eec4dba4824a 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -1333,6 +1333,7 @@ class ConstExprEmitter
     case CK_ZeroToOCLOpaqueType:
     case CK_MatrixCast:
     case CK_HLSLVectorTruncation:
+    case CK_HLSLMatrixTruncation:
     case CK_HLSLArrayRValue:
     case CK_HLSLElementwiseCast:
     case CK_HLSLAggregateSplatCast:
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 714192db1b15c..a9e2ebdffa59a 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2422,9 +2422,27 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction &CGF, LValue SrcVal,
     }
     return V;
   }
+  if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
+    assert(LoadList.size() >= MatTy->getNumElementsFlattened() &&
+           "Flattened type on RHS must have the same number or more elements "
+           "than vector on LHS.");
+    llvm::Value *V =
+        CGF.Builder.CreateLoad(CGF.CreateIRTemp(DestTy, "flatcast.tmp"));
+    // write to V.
+    for (unsigned I = 0, E = MatTy->getNumElementsFlattened(); I < E; I++) {
+      RValue RVal = CGF.EmitLoadOfLValue(LoadList[I], Loc);
+      assert(RVal.isScalar() &&
+             "All flattened source values should be scalars.");
+      llvm::Value *Cast =
+          CGF.EmitScalarConversion(RVal.getScalarVal(), LoadList[I].getType(),
+                                   MatTy->getElementType(), Loc);
+      V = CGF.Builder.CreateInsertElement(V, Cast, I);
+    }
+    return V;
+  }
   // if its a builtin just do an extract element or load.
   assert(DestTy->isBuiltinType() &&
-         "Destination type must be a vector or builtin type.");
+         "Destination type must be a vector, matrix, or builtin type.");
   RValue RVal = CGF.EmitLoadOfLValue(LoadList[0], Loc);
   assert(RVal.isScalar() && "All flattened source values should be scalars.");
   return CGF.EmitScalarConversion(RVal.getScalarVal(), LoadList[0].getType(),
@@ -2954,6 +2972,21 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
     return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc");
   }
+  case CK_HLSLMatrixTruncation: {
+    assert((DestTy->isMatrixType() || DestTy->isBuiltinType()) &&
+           "Destination type must be a matrix or builtin type.");
+    Value *Mat = Visit(E);
+    if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
+      SmallVector<int> Mask;
+      unsigned NumElts = MatTy->getNumElementsFlattened();
+      for (unsigned I = 0; I != NumElts; ++I)
+        Mask.push_back(I);
+
+      return Builder.CreateShuffleVector(Mat, Mask, "trunc");
+    }
+    llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
+    return Builder.CreateExtractElement(Mat, Zero, "cast.mtrunc");
+  }
   case CK_HLSLElementwiseCast: {
     RValue RV = CGF.EmitAnyExpr(E);
     SourceLocation Loc = CE->getExprLoc();
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index d6f70e728be29..69719ebd1fc8c 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -5196,6 +5196,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
   case ICK_Incompatible_Pointer_Conversion:
   case ICK_HLSL_Array_RValue:
   case ICK_HLSL_Vector_Truncation:
+  case ICK_HLSL_Matrix_Truncation:
   case ICK_HLSL_Vector_Splat:
     llvm_unreachable("Improper second standard conversion");
   }
@@ -5203,12 +5204,10 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
   if (SCS.Dimension != ICK_Identity) {
     // If SCS.Element is not ICK_Identity the To and From types must be HLSL
     // vectors or matrices.
-
-    // TODO: Support HLSL matrices.
-    assert((!From->getType()->isMatrixType() && !ToType->isMatrixType()) &&
-           "Dimension conversion for matrix types is not implemented yet.");
-    assert((ToType->isVectorType() || ToType->isBuiltinType()) &&
-           "Dimension conversion output must be vector or scalar type.");
+    assert(
+        (ToType->isVectorType() || ToType->isConstantMatrixType() ||
+         ToType->isBuiltinType()) &&
+        "Dimension conversion output must be vector, matrix, or scalar type.");
     switch (SCS.Dimension) {
     case ICK_HLSL_Vector_Splat: {
       // Vector splat from any arithmetic type to a vector.
@@ -5234,6 +5233,17 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
 
       break;
     }
+    case ICK_HLSL_Matrix_Truncation: {
+      auto *FromMat = From->getType()->castAs<ConstantMatrixType>();
+      QualType TruncTy = FromMat->getElementType();
+      if (auto *ToMat = ToType->getAs<ConstantMatrixType>())
+        TruncTy = Context.getConstantMatrixType(TruncTy, ToMat->getNumRows(),
+                                                ToMat->getNumColumns());
+      From = ImpCastExprToType(From, TruncTy, CK_HLSLMatrixTruncation,
+                               From->getValueKind())
+                 .get();
+      break;
+    }
     case ICK_Identity:
     default:
       llvm_unreachable("Improper element standard conversion");
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index ecab3946b58c7..921fed93c4268 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3721,7 +3721,6 @@ bool SemaHLSL::CanPerformAggregateSplatCast(Expr *Src, QualType DestTy) {
 }
 
 // Can we perform an HLSL Elementwise cast?
-// TODO: update this code when matrices are added; see issue #88060
 bool SemaHLSL::CanPerformElementwiseCast(Expr *Src, QualType DestTy) {
 
   // Don't handle casts where LHS and RHS are any combination of scalar/vector
@@ -3734,6 +3733,10 @@ bool SemaHLSL::CanPerformElementwiseCast(Expr *Src, QualType DestTy) {
       (DestTy->isScalarType() || DestTy->isVectorType()))
     return false;
 
+  if (SrcTy->isConstantMatrixType() &&
+      (DestTy->isScalarType() || DestTy->isConstantMatrixType()))
+    return false;
+
   llvm::SmallVector<QualType> DestTypes;
   BuildFlattenedTypeList(DestTy, DestTypes);
   llvm::SmallVector<QualType> SrcTypes;
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index c12f92dfdab66..b5f92eab9b2f1 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -162,6 +162,7 @@ ImplicitConversionRank clang::GetConversionRank(ImplicitConversionKind Kind) {
       ICR_C_Conversion_Extension,
       ICR_Conversion,
       ICR_HLSL_Dimension_Reduction,
+      ICR_HLSL_Dimension_Reduction,
       ICR_Conversion,
       ICR_HLSL_Scalar_Widening,
   };
@@ -224,6 +225,7 @@ static const char *GetImplicitConversionName(ImplicitConversionKind Kind) {
       "Incompatible pointer conversion",
       "Fixed point conversion",
       "HLSL vector truncation",
+      "HLSL matrix truncation",
       "Non-decaying array conversion",
       "HLSL vector splat",
   };
@@ -2060,9 +2062,10 @@ static bool IsFloatingPointConversion(Sema &S, QualType FromType,
   return true;
 }
 
-static bool IsVectorElementConversion(Sema &S, QualType FromType,
-                                      QualType ToType,
-                                      ImplicitConversionKind &ICK, Expr *From) {
+static bool IsVectorOrMatrixElementConversion(Sema &S, QualType FromType,
+                                              QualType ToType,
+                                              ImplicitConversionKind &ICK,
+                                              Expr *From) {
   if (S.Context.hasSameUnqualifiedType(FromType, ToType))
     return true;
 
@@ -2102,6 +2105,59 @@ static bool IsVectorElementConversion(Sema &S, QualType FromType,
   return false;
 }
 
+/// Determine whether the conversion from FromType to ToType is a valid
+/// matrix conversion.
+///
+/// \param ICK Will be set to the matrix conversion kind, if this is a matrix
+/// conversion.
+static bool IsMatrixConversion(Sema &S, QualType FromType, QualType ToType,
+                               ImplicitConversionKind &ICK,
+                               ImplicitConversionKind &ElConv, Expr *From,
+                               bool InOverloadResolution, bool CStyle) {
+  // The non HLSL Matrix conversion rules are not clear.
+  if (!S.getLangOpts().HLSL)
+    return false;
+
+  auto *ToMatrixType = ToType->getAs<ConstantMatrixType>();
+  auto *FromMatrixType = FromType->getAs<ConstantMatrixType>();
+
+  // If both arguments are vectors, handle possible vector truncation and
+  // element conversion.
+  if (ToMatrixType && FromMatrixType) {
+    unsigned FromCols = FromMatrixType->getNumColumns();
+    unsigned ToCols = ToMatrixType->getNumColumns();
+    if (FromCols < ToCols)
+      return false;
+
+    unsigned FromRows = FromMatrixType->getNumRows();
+    unsigned ToRows = ToMatrixType->getNumRows();
+    if (FromRows < ToRows)
+      return false;
+
+    unsigned FromElts = FromMatrixType->getNumElementsFlattened();
+    unsigned ToElts = ToMatrixType->getNumElementsFlattened();
+    if (FromElts == ToElts)
+      ElConv = ICK_Identity;
+    else
+      ElConv = ICK_HLSL_Matrix_Truncation;
+
+    QualType FromElTy = FromMatrixType->getElementType();
+    QualType ToElTy = ToMatrixType->getElementType();
+    if (S.Context.hasSameUnqualifiedType(FromElTy, ToElTy))
+      return true;
+    return IsVectorOrMatrixElementConversion(S, FromElTy, ToElTy, ICK, From);
+  }
+  if (FromMatrixType && !ToMatrixType) {
+    ElConv = ICK_HLSL_Matrix_Truncation;
+    QualType FromElTy = FromMatrixType->getElementType();
+    if (S.Context.hasSameUnqualifiedType(FromElTy, ToType))
+      return true;
+    return IsVectorOrMatrixElementConversion(S, FromElTy, ToType, ICK, From);
+  }
+
+  return false;
+}
+
 /// Determine whether the conversion from FromType to ToType is a valid
 /// vector conversion.
 ///
@@ -2141,14 +2197,14 @@ static bool IsVectorConversion(Sema &S, QualType FromType, QualType ToType,
       QualType ToElTy = ToExtType->getElementType();
       if (S.Context.hasSameUnqualifiedType(FromElTy, ToElTy))
         return true;
-      return IsVectorElementConversion(S, FromElTy, ToElTy, ICK, From);
+      return IsVectorOrMatrixElementConversion(S, FromElTy, ToElTy, ICK, From);
     }
     if (FromExtType && !ToExtType) {
       ElConv = ICK_HLSL_Vector_Truncation;
       QualType FromElTy = FromExtType->getElementType();
       if (S.Context.hasSameUnqualifiedType(FromElTy, ToType))
         return true;
-      return IsVectorElementConversion(S, FromElTy, ToType, ICK, From);
+      return IsVectorOrMatrixElementConversion(S, FromElTy, ToType, ICK, From);
     }
     // Fallthrough for the case where ToType is a vector and FromType is not.
   }
@@ -2175,7 +2231,8 @@ static bool IsVectorConversion(Sema &S, QualType FromType, QualType ToType,
       if (S.getLangOpts().HLSL) {
         ElConv = ICK_HLSL_Vector_Splat;
         QualType ToElTy = ToExtType->getElementType();
-        return IsVectorElementConversion(S, FromType, ToElTy, ICK, From);
+        return IsVectorOrMatrixElementConversion(S, FromType, ToElTy, ICK,
+                                                 From);
       }
       ICK = ICK_Vector_Splat;
       return true;
@@ -2474,6 +2531,11 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType,
     SCS.Second = SecondICK;
     SCS.Dimension = DimensionICK;
     FromType = ToType.getUnqualifiedType();
+  } else if (IsMatrixConversion(S, FromType, ToType, SecondICK, DimensionICK,
+                                From, InOverloadResolution, CStyle)) {
+    SCS.Second = SecondICK;
+    SCS.Dimension = DimensionICK;
+    FromType = ToType.getUnqualifiedType();
   } else if (!S.getLangOpts().CPlusPlus &&
              S.Context.typesAreCompatible(ToType, FromType)) {
     // Compatible conversions (Clang extension for C function overloading)
@@ -6251,6 +6313,7 @@ static bool CheckConvertedConstantConversions(Sema &S,
   case ICK_Incompatible_Pointer_Conversion:
   case ICK_Fixed_Point_Conversion:
   case ICK_HLSL_Vector_Truncation:
+  case ICK_HLSL_Matrix_Truncation:
     return false;
 
   case ICK_Lvalue_To_Rvalue:
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index 4ddf8fd5b4b0f..db27c06cd18a3 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -560,6 +560,7 @@ void ExprEngine::VisitCast(const CastExpr *CastE, const Expr *Ex,
       case CK_VectorSplat:
       case CK_HLSLElementwiseCast:
       case CK_HLSLAggregateSplatCast:
+      case CK_HLSLMatrixTruncation:
       case CK_HLSLVectorTruncation: {
         QualType resultType = CastE->getType();
         if (CastE->isGLValue())
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
new file mode 100644
index 0000000000000..081b8013efcbc
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
@@ -0,0 +1,186 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fnative-half-type -fnative-int16-type -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast0u11matrix_typeILm3ELm2EfE(
+// CHECK-SAME: <6 x float> noundef nofpclass(nan inf) [[F32:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca [6 x float], align 4
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <6 x float> [[F32]], ptr [[F32_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x float>, ptr [[F32_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x float> [[TMP0]] to <6 x i32>
+// CHECK-NEXT:    store <6 x i32> [[CONV]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+int3x2 elementwise_type_cast0(float3x2 f32) {
+    int3x2 i32 = (int3x2)f32;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast1u11matrix_typeILm3ELm2EsE(
+// CHECK-SAME: <6 x i16> noundef [[I16_32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I16_32_ADDR:%.*]] = alloca [6 x i16], align 2
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <6 x i16> [[I16_32]], ptr [[I16_32_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i16>, ptr [[I16_32_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = sext <6 x i16> [[TMP0]] to <6 x i32>
+// CHECK-NEXT:    store <6 x i32> [[CONV]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+int3x2 elementwise_type_cast1(int16_t3x2 i16_32) {
+    int3x2 i32 = (int3x2)i16_32;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast2u11matrix_typeILm3ELm2ElE(
+// CHECK-SAME: <6 x i64> noundef [[I64_32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I64_32_ADDR:%.*]] = alloca [6 x i64], align 8
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <6 x i64> [[I64_32]], ptr [[I64_32_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[I64_32_ADDR]], align 8
+// CHECK-NEXT:    [[CONV:%.*]] = trunc <6 x i64> [[TMP0]] to <6 x i32>
+// CHECK-NEXT:    store <6 x i32> [[CONV]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+int3x2 elementwise_type_cast2(int64_t3x2 i64_32) {
+    int3x2 i32 = (int3x2)i64_32;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i16> @_Z22elementwise_type_cast3u11matrix_typeILm2ELm3EDhE(
+// CHECK-SAME: <6 x half> noundef nofpclass(nan inf) [[H23:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[H23_ADDR:%.*]] = alloca [6 x half], align 2
+// CHECK-NEXT:    [[I23:%.*]] = alloca [6 x i16], align 2
+// CHECK-NEXT:    store <6 x half> [[H23]], ptr [[H23_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x half>, ptr [[H23_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x half> [[TMP0]] to <6 x i16>
+// CHECK-NEXT:    store <6 x i16> [[CONV]], ptr [[I23]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i16>, ptr [[I23]], align 2
+// CHECK-NEXT:    ret <6 x i16> [[TMP1]]
+//
+int16_t2x3 elementwise_type_cast3(half2x3 h23) {
+    int16_t2x3 i23 = (int16_t2x3)h23;
+    return i23;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast4u11matrix_typeILm3ELm2EdE(
+// CHECK-SAME: <6 x double> noundef nofpclass(nan inf) [[D32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[D32_ADDR:%.*]] = alloca [6 x double], align 8
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <6 x double> [[D32]], ptr [[D32_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x double>, ptr [[D32_ADDR]], align 8
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x double> [[TMP0]] to <6 x i32>
+// CHECK-NEXT:    store <6 x i32> [[CONV]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+int3x2 elementwise_type_cast4(double3x2 d32) {
+    int3x2 i32 = (int3x2)d32;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden void @_Z5call2v(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT:    [[B:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT:    [[AGG_TEMP:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @__const._Z5call2v.A, i32 8, i1 false)
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TEMP]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[AGG_TEMP]], i32 0, i32 0, i32 0
+// CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[AGG_TEMP]], i32 0, i32 1, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[FLATCAST_TMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[GEP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP1]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP3]], i64 1
+// CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[B]], align 4
+// CHECK-NEXT:    ret void
+//
+void call2() {
+  int A[2][1] = {{1},{2}};
+  int2x1 B = (int2x1)A;
+}
+
+struct S {
+  int X;
+  float Y;
+};
+
+// CHECK-LABEL: define hidden void @_Z5call3v(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 1
+// CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT:    [[AGG_TEMP:%.*]] = alloca [[STRUCT_S]], align 1
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[S]], ptr align 1 @__const._Z5call3v.s, i32 8, i1 false)
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_TEMP]], ptr align 1 [[S]], i32 8, i1 false)
+// CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_TEMP]], i32 0, i32 0
+// CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[AGG_TEMP]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[FLATCAST_TMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[GEP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[GEP1]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[CONV]], i64 1
+// CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[A]], align 4
+// CHECK-NEXT:    ret void
+//
+void call3() {
+  S s = {1, 2.0};
+  int2x1 A = (int2x1)s;
+}
+
+struct BFields {
+  double D;
+  int E: 15;
+  int : 8;
+  float F;
+};
+
+struct Derived : BFields {
+  int G;
+};
+
+// CHECK-LABEL: define hidden void @_Z5call47Derived(
+// CHECK-SAME: ptr noundef byval([[STRUCT_DERIVED:%.*]]) align 1 [[D:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT:    [[AGG_TEMP:%.*]] = alloca [[STRUCT_DERIVED]], align 1
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x i32>, align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_TEMP]], ptr align 1 [[D]], i32 19, i1 false)
+// CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[STRUCT_DERIVED]], ptr [[AGG_TEMP]], i32 0, i32 0
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_BFIELDS:%.*]], ptr [[GEP]], i32 0, i32 1
+// CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_DERIVED]], ptr [[AGG_TEMP]], i32 0, i32 0, i32 0
+// CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_DERIVED]], ptr [[AGG_TEMP]], i32 0, i32 0, i32 2
+// CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [[STRUCT_DERIVED]], ptr [[AGG_TEMP]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[FLATCAST_TMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[GEP1]], align 8
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[TMP1]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV]], i64 0
+// CHECK-NEXT:    [[BF_LOAD:%.*]] = load i24, ptr [[E]], align 1
+// CHECK-NEXT:    [[BF_SHL:%.*]] = shl i24 [[BF_LOAD]], 9
+// CHECK-NEXT:    [[BF_ASHR:%.*]] = ashr i24 [[BF_SHL]], 9
+// CHECK-NEXT:    [[BF_CAST:%.*]] = sext i24 [[BF_ASHR]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[BF_CAST]], i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4
+// CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP4]] to i32
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CONV4]], i64 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP3]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 3
+// CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[A]], align 4
+// CHECK-NEXT:    ret void
+//
+void call4(Derived D) {
+  int2x2 A = (int2x2)D;
+}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl
new file mode 100644
index 0000000000000..f16d01e1d12ea
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl
@@ -0,0 +1,156 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.7-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
+
+// CHECK-LABEL: define hidden noundef <12 x i32> @_Z10trunc_castu11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I34:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I34]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I34]], align 4
+// CHECK-NEXT:    ret <12 x i32> [[TMP1]]
+//
+ int3x4 trunc_cast(int4x4 i44) {
+    int3x4 i34 = (int3x4)i44;
+    return i34;
+}
+
+// CHECK-LABEL: define hidden noundef <12 x i32> @_Z11trunc_cast0u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I43:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I43]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I43]], align 4
+// CHECK-NEXT:    ret <12 x i32> [[TMP1]]
+//
+ int4x3 trunc_cast0(int4x4 i44) {
+    int4x3 i43 = (int4x3)i44;
+    return i43;
+}
+
+// CHECK-LABEL: define hidden noundef <9 x i32> @_Z11trunc_cast1u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I33:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+// CHECK-NEXT:    store <9 x i32> [[TRUNC]], ptr [[I33]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[I33]], align 4
+// CHECK-NEXT:    ret <9 x i32> [[TMP1]]
+//
+ int3x3 trunc_cast1(int4x4 i44) {
+    int3x3 i33 = (int3x3)i44;
+    return i33;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast2u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+ int3x2 trunc_cast2(int4x4 i44) {
+    int3x2 i32 = (int3x2)i44;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast3u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I23:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I23]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I23]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+ int2x3 trunc_cast3(int4x4 i44) {
+    int2x3 i23 = (int2x3)i44;
+    return i23;
+}
+
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z11trunc_cast4u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I22:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    store <4 x i32> [[TRUNC]], ptr [[I22]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I22]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+ int2x2 trunc_cast4(int4x4 i44) {
+    int2x2 i22 = (int2x2)i44;
+    return i22;
+}
+
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z11trunc_cast5u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I21:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    store <2 x i32> [[TRUNC]], ptr [[I21]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[I21]], align 4
+// CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+//
+ int2x1 trunc_cast5(int4x4 i44) {
+    int2x1 i21 = (int2x1)i44;
+    return i21;
+}
+
+// CHECK-LABEL: define hidden noundef i32 @_Z11trunc_cast6u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I1:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_MTRUNC:%.*]] = extractelement <16 x i32> [[TMP0]], i32 0
+// CHECK-NEXT:    store i32 [[CAST_MTRUNC]], ptr [[I1]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I1]], align 4
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
+ int trunc_cast6(int4x4 i44) {
+    int i1 = (int)i44;
+    return i1;
+}
+
+// CHECK-LABEL: define hidden noundef i32 @_Z16trunc_multi_castu11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I1:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    [[CAST_MTRUNC:%.*]] = extractelement <12 x i32> [[TRUNC]], i32 0
+// CHECK-NEXT:    store i32 [[CAST_MTRUNC]], ptr [[I1]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I1]], align 4
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
+ int trunc_multi_cast(int4x4 i44) {
+    int i1 = (int)(int3x4)i44;
+    return i1;
+}
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixCastErrors.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixCastErrors.hlsl
new file mode 100644
index 0000000000000..59d432cd3eb00
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixCastErrors.hlsl
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -std=hlsl202x -verify %s
+
+// Note column is too large
+export int3x2 shape_cast_error(float2x3 f23) {
+    int3x2 i32 = (int3x2)f23;
+    // expected-error at -1 {{conversion between matrix types 'int3x2' (aka 'matrix<int, 3, 2>') and 'matrix<float, 2, 3>' of different size is not allowed}}
+    return i32;
+}
+// Note row is too large
+export int2x3 shape_cast_error2(float3x2 f32) {
+    int2x3 i23 = (int2x3)f32;
+    // expected-error at -1 {{conversion between matrix types 'int2x3' (aka 'matrix<int, 2, 3>') and 'matrix<float, 3, 2>' of different size is not allowed}}
+    return i23;
+}
+
+// Note do the type change independent of the shape should still error
+export int2x3 shape_cast_error3(float3x2 f32) {
+    int2x3 i23 = (int3x2)f32;
+    // expected-error at -1 {{cannot initialize a variable of type 'matrix<[...], 2, 3>' with an rvalue of type 'matrix<[...], 3, 2>}}
+    return i23;
+}

>From ba2dab99a64b3cfbf00f9286e72a58b735c332a0 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Thu, 20 Nov 2025 13:05:55 -0500
Subject: [PATCH 02/12] fix ObjC warning

---
 clang/lib/Edit/RewriteObjCFoundationAPI.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
index 40f8348241ecc..e8d4660fd36b2 100644
--- a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
+++ b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
@@ -1085,6 +1085,7 @@ static bool rewriteToNumericBoxedExpression(const ObjCMessageExpr *Msg,
       llvm_unreachable("OpenCL-specific cast in Objective-C?");
 
     case CK_HLSLVectorTruncation:
+    case CK_HLSLMatrixTruncation:
     case CK_HLSLElementwiseCast:
     case CK_HLSLAggregateSplatCast:
       llvm_unreachable("HLSL-specific cast in Objective-C?");

>From 909ad8da1c24ae4ea792af0fc18c1b52e53f66eb Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Thu, 20 Nov 2025 18:21:13 -0500
Subject: [PATCH 03/12] address pr comments

---
 clang/include/clang/Sema/Overload.h           |   2 +-
 clang/lib/AST/ExprConstant.cpp                |   6 +-
 clang/lib/Sema/SemaOverload.cpp               |   4 +-
 ...ion.hlsl => MatrixExplicitTruncation.hlsl} |   0
 .../MatrixImplicitTruncation.hlsl             | 138 +++++++++
 .../MatrixElementOverloadResolution.hlsl      | 287 ++++++++++++++++++
 6 files changed, 430 insertions(+), 7 deletions(-)
 rename clang/test/CodeGenHLSL/BasicFeatures/{MatrixTruncation.hlsl => MatrixExplicitTruncation.hlsl} (100%)
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
 create mode 100644 clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl

diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 1ad52cb9da517..ab45328ee8ab7 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -198,7 +198,7 @@ class Sema;
     /// HLSL vector truncation.
     ICK_HLSL_Vector_Truncation,
 
-    /// HLSL Matrid truncation.
+    /// HLSL Matrix truncation.
     ICK_HLSL_Matrix_Truncation,
 
     /// HLSL non-decaying array rvalue cast.
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index baadaab0973a2..c6366a9df4dd8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11771,7 +11771,7 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
     return Success(Elements, E);
   }
   case CK_HLSLMatrixTruncation: {
-    // TODO: support Expr Constant for Matrix Truncation
+    // TODO: See #168935. Add matrix truncation support to expr constant.
     return Error(E);
   }
   case CK_HLSLAggregateSplatCast: {
@@ -18435,7 +18435,7 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
     return Success(Val.getVectorElt(0), E);
   }
   case CK_HLSLMatrixTruncation: {
-    // TODO: support Expr Constant for Matrix Truncation
+    // TODO: See #168935. Add matrix truncation support to expr constant.
     return Error(E);
   }
   case CK_HLSLElementwiseCast: {
@@ -19032,7 +19032,7 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
     return Success(Val.getVectorElt(0), E);
   }
   case CK_HLSLMatrixTruncation: {
-    // TODO: support Expr Constant for Matrix Truncation
+    // TODO: See #168935. Add matrix truncation support to expr constant.
     return Error(E);
   }
   case CK_HLSLElementwiseCast: {
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index b5f92eab9b2f1..1acbda85601af 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -2134,9 +2134,7 @@ static bool IsMatrixConversion(Sema &S, QualType FromType, QualType ToType,
     if (FromRows < ToRows)
       return false;
 
-    unsigned FromElts = FromMatrixType->getNumElementsFlattened();
-    unsigned ToElts = ToMatrixType->getNumElementsFlattened();
-    if (FromElts == ToElts)
+    if (FromRows == ToRows && FromCols == ToCols)
       ElConv = ICK_Identity;
     else
       ElConv = ICK_HLSL_Matrix_Truncation;
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/BasicFeatures/MatrixTruncation.hlsl
rename to clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
new file mode 100644
index 0000000000000..6a53d2e8ee96c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
@@ -0,0 +1,138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.7-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
+
+// CHECK-LABEL: define hidden noundef <12 x i32> @_Z10trunc_castu11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I34:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I34]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I34]], align 4
+// CHECK-NEXT:    ret <12 x i32> [[TMP1]]
+//
+ int3x4 trunc_cast(int4x4 i44) {
+    int3x4 i34 = i44;
+    return i34;
+}
+
+// CHECK-LABEL: define hidden noundef <12 x i32> @_Z11trunc_cast0u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I43:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I43]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I43]], align 4
+// CHECK-NEXT:    ret <12 x i32> [[TMP1]]
+//
+ int4x3 trunc_cast0(int4x4 i44) {
+    int4x3 i43 = i44;
+    return i43;
+}
+
+// CHECK-LABEL: define hidden noundef <9 x i32> @_Z11trunc_cast1u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I33:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+// CHECK-NEXT:    store <9 x i32> [[TRUNC]], ptr [[I33]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[I33]], align 4
+// CHECK-NEXT:    ret <9 x i32> [[TMP1]]
+//
+ int3x3 trunc_cast1(int4x4 i44) {
+    int3x3 i33 = i44;
+    return i33;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast2u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I32]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+ int3x2 trunc_cast2(int4x4 i44) {
+    int3x2 i32 = i44;
+    return i32;
+}
+
+// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast3u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I23:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I23]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I23]], align 4
+// CHECK-NEXT:    ret <6 x i32> [[TMP1]]
+//
+ int2x3 trunc_cast3(int4x4 i44) {
+    int2x3 i23 = i44;
+    return i23;
+}
+
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z11trunc_cast4u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I22:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    store <4 x i32> [[TRUNC]], ptr [[I22]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I22]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+ int2x2 trunc_cast4(int4x4 i44) {
+    int2x2 i22 = i44;
+    return i22;
+}
+
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z11trunc_cast5u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I21:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    store <2 x i32> [[TRUNC]], ptr [[I21]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[I21]], align 4
+// CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+//
+ int2x1 trunc_cast5(int4x4 i44) {
+    int2x1 i21 = i44;
+    return i21;
+}
+
+// CHECK-LABEL: define hidden noundef i32 @_Z11trunc_cast6u11matrix_typeILm4ELm4EiE(
+// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT:    [[I1:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_MTRUNC:%.*]] = extractelement <16 x i32> [[TMP0]], i32 0
+// CHECK-NEXT:    store i32 [[CAST_MTRUNC]], ptr [[I1]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I1]], align 4
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
+ int trunc_cast6(int4x4 i44) {
+    int i1 = i44;
+    return i1;
+}
diff --git a/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
new file mode 100644
index 0000000000000..ebe4db7a9e26e
--- /dev/null
+++ b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
@@ -0,0 +1,287 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wconversion -verify -o - %s -DERROR=1
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wno-conversion -ast-dump %s | FileCheck %s
+
+// This test verifies floating point type implicit conversion ranks for overload
+// resolution. In HLSL the built-in type ranks are half < float < double. This
+// applies to both scalar and matrix types.
+
+// HLSL allows implicit truncation fo types, so it differentiates between
+// promotions (converting to larger types) and conversions (converting to
+// smaller types). Promotions are preferred over conversions. Promotions prefer
+// promoting to the next lowest type in the ranking order. Conversions prefer
+// converting to the next highest type in the ranking order.
+
+void HalfFloatDouble(double2x2 D);
+void HalfFloatDouble(float2x2 F);
+void HalfFloatDouble(half2x2 H);
+
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (double2x2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (float2x2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (half2x2)'
+
+void FloatDouble(double2x2 D); // expected-note {{candidate function}}
+void FloatDouble(float2x2 F); // expected-note {{candidate function}}
+
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (double2x2)'
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (float2x2)'
+
+void HalfDouble(double2x2 D);
+void HalfDouble(half2x2 H);
+
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (double2x2)'
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (half2x2)'
+
+void HalfFloat(float2x2 F); // expected-note {{candidate function}}
+void HalfFloat(half2x2 H); // expected-note {{candidate function}}
+
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (float2x2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (half2x2)'
+
+void Double(double2x2 D);
+void Float(float2x2 F);
+void Half(half2x2 H);
+
+// CHECK: FunctionDecl {{.*}} used Double 'void (double2x2)'
+// CHECK: FunctionDecl {{.*}} used Float 'void (float2x2)'
+// CHECK: FunctionDecl {{.*}} used Half 'void (half2x2)'
+
+// Case 1: A function declared with overloads for half float and double types.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case1 'void (half2x2, float2x2, double2x2)'
+void Case1(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (half2x2)'
+  HalfFloatDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (float2x2)'
+  HalfFloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (double2x2)'
+  HalfFloatDouble(D);
+}
+
+// Case 2: A function declared with double and float overlaods.
+//   (a) When called with half, it fails to resulve the ambiguous promotion.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case2 'void (half2x2, float2x2, double2x2)'
+void Case2(half2x2 H, float2x2 F, double2x2 D) {
+#if ERROR
+  FloatDouble(H); // expected-error {{call to 'FloatDouble' is ambiguous}}
+#endif
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'FloatDouble' 'void (float2x2)'
+  FloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'FloatDouble' 'void (double2x2)'
+  FloatDouble(D);
+}
+
+// Case 3: A function declared with half and double overloads
+//   (a) When called with half, it will resolve to half because it is an exact
+//   match.
+//   (b) When called with flaot, it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double, it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case3 'void (half2x2, float2x2, double2x2)'
+void Case3(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'HalfDouble' 'void (half2x2)'
+  HalfDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'HalfDouble' 'void (double2x2)'
+  HalfDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'HalfDouble' 'void (double2x2)'
+  HalfDouble(D);
+}
+
+// Case 4: A function declared with half and float overloads.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it fails to resolve the ambigjuous conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case4 'void (half2x2, float2x2, double2x2)'
+void Case4(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'HalfFloat' 'void (half2x2)'
+  HalfFloat(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'HalfFloat' 'void (float2x2)'
+  HalfFloat(F);
+
+#if ERROR
+  HalfFloat(D); // expected-error{{call to 'HalfFloat' is ambiguous}}
+#endif
+}
+
+// Case 5: A function declared with only a double overload.
+//   (a) When called with half, it will resolve to double because double is a
+//   valid promotion.
+//   (b) When called with float it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case5 'void (half2x2, float2x2, double2x2)'
+void Case5(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'Double' 'void (double2x2)'
+  Double(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'Double' 'void (double2x2)'
+  Double(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2x2)' lvalue Function {{.*}} 'Double' 'void (double2x2)'
+  Double(D);
+}
+
+// Case 6: A function declared with only a float overload.
+//   (a) When called with half, it will resolve to float because float is a
+//   valid promotion.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case6 'void (half2x2, float2x2, double2x2)'
+void Case6(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'Float' 'void (float2x2)'
+  Float(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'Float' 'void (float2x2)'
+  Float(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'Float' 'void (float2x2)'
+  Float(D); // TODO: See #168944. Make this an expect warning. {{implicit conversion loses floating-point precision: 'double2x2' (aka 'matrix<double, 2, 2>') to 'matrix<float, 2, 2>' (matrix of 2 'float' values)}}
+}
+
+// Case 7: A function declared with only a half overload.
+//   (a) When called with half, it will resolve to half because half is an
+//   exact match
+//   (b) When called with float it will resolve to half because half is a
+//   valid conversion.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case7 'void (half2x2, float2x2, double2x2)'
+void Case7(half2x2 H, float2x2 F, double2x2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'Half' 'void (half2x2)'
+  Half(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'Half' 'void (half2x2)'
+  Half(F); // TODO: See #168944. Make this an expect warning. {{implicit conversion loses floating-point precision: 'float2x2' (aka 'matrix<float, 2, 2>') to 'matrix<half, 2, 2>' (matrix of 4 'half' values)}}
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2x2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2x2)' lvalue Function {{.*}} 'Half' 'void (half2x2)'
+  Half(D); // TODO: See #168944. Make this an expect warning.  {{implicit conversion loses floating-point precision: 'double2x2' (aka 'matrix<double, 2, 2>') to 'matrix<half, 2, 2>' (matrix of 4 'half' values)}}
+}
+
+void fn3x2(float3x2) {} // expected-note{{candidate function}}
+void fn2x2(float2x2) {}
+void fn2x2IO(inout float2x2) {}
+void fnI2x2IO(inout int2x2) {}
+
+void matOrVec(float4 F) {}
+void matOrVec(float2x2 F) {}
+
+void matOrVec2(float3 F) {} // expected-note{{candidate function}}
+void matOrVec2(float2x3 F) {} // expected-note{{candidate function}}
+
+export void Case8(float2x3 f23, float4x4 f44, float3x3 f33, float3x2 f32) {
+  int2x2 i22 = f23;
+  //CHECK: VarDecl {{.*}} i22 'int2x2':'matrix<int, 2, 2>' cinit
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2x2':'matrix<int, 2, 2>' <FloatingToIntegral>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
+#ifdef ERROR
+  int3x2 i32 = f23; // expected-error{{cannot initialize a variable of type 'matrix<int, 3, 2>' with an lvalue of type 'matrix<float, 2, 3>'}}
+  fn3x2(f23); // expected-error{{no matching function for call to 'fn3x2'}}
+#endif
+  
+  fn2x2(f23);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'fn2x2' 'void (float2x2)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
+
+#ifdef ERROR
+  fn2x2IO(f23); // expected-error{{assigning to 'matrix<[2 * ...], 3>' from incompatible type 'matrix<[2 * ...], 2>'}}
+  fnI2x2IO(f23); // expected-error{{assigning to 'matrix<float, [...], 3>' from incompatible type 'matrix<int, [...], 2>'}}
+#endif
+
+  matOrVec(f23);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'matOrVec' 'void (float2x2)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
+
+  matOrVec(f44);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'matOrVec' 'void (float2x2)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4x4':'matrix<float, 4, 4>' <LValueToRValue>
+
+#ifdef ERROR
+  matOrVec(2.0); // TODO: See #168960 this should be ambiguous once we implement ICK_HLSL_Matrix_Splat.
+#endif
+  matOrVec2(f23);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x3)' lvalue Function {{.*}} 'matOrVec2' 'void (float2x3)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
+
+  matOrVec2(f44);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x3)' lvalue Function {{.*}} 'matOrVec2' 'void (float2x3)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 3>' <HLSLMatrixTruncation>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4x4':'matrix<float, 4, 4>' <LValueToRValue>
+
+  matOrVec2(f33);
+  //CHECK: DeclRefExpr {{.*}} 'void (float2x3)' lvalue Function {{.*}} 'matOrVec2' 'void (float2x3)'
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 3>' <HLSLMatrixTruncation>
+  //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float3x3':'matrix<float, 3, 3>' <LValueToRValue>
+  
+#ifdef ERROR
+  matOrVec2(f32); // expected-error{{no matching function for call to 'matOrVec2'}}
+#endif
+}

>From 3e13f119828fbc6783bda5b51269b1b43d4e977e Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 21 Nov 2025 15:25:20 -0500
Subject: [PATCH 04/12] add implicit trunc warnings

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 +++
 clang/lib/Sema/SemaChecking.cpp               | 14 ++++++
 .../MatrixImplicitTruncCastWarnings.hlsl      | 50 +++++++++++++++++++
 3 files changed, 70 insertions(+)
 create mode 100644 clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4a145fd71eedd..a107e7eea8b79 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4356,6 +4356,9 @@ def warn_param_typestate_mismatch : Warning<
 def warn_unknown_sanitizer_ignored : Warning<
   "unknown sanitizer '%0' ignored">, InGroup<UnknownSanitizers>;
 
+def warn_impcast_matrix_scalar : Warning<
+  "implicit conversion turns matrix to scalar: %0 to %1">,
+  InGroup<Conversion>, DefaultIgnore;
 def warn_impcast_vector_scalar : Warning<
   "implicit conversion turns vector to scalar: %0 to %1">,
   InGroup<Conversion>, DefaultIgnore;
@@ -13276,6 +13279,9 @@ def err_hlsl_builtin_scalar_vector_mismatch
 def warn_hlsl_impcast_vector_truncation : Warning<
   "implicit conversion truncates vector: %0 to %1">, InGroup<Conversion>;
 
+def warn_hlsl_impcast_matrix_truncation : Warning<
+  "implicit conversion truncates matrix: %0 to %1">, InGroup<Conversion>;
+
 def warn_hlsl_availability : Warning<
   "%0 is only available %select{|in %4 environment }3on %1 %2 or newer">,
   InGroup<HLSLAvailability>, DefaultError;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 0ffb4854ba86d..cbb909ebbe9cf 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -37,6 +37,7 @@
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/TemplateName.h"
 #include "clang/AST/Type.h"
+#include "clang/AST/TypeBase.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/AST/UnresolvedSet.h"
 #include "clang/Basic/AddressSpaces.h"
@@ -12591,6 +12592,19 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
   if (auto VecTy = dyn_cast<VectorType>(Target))
     Target = VecTy->getElementType().getTypePtr();
 
+  if (isa<ConstantMatrixType>(Source)) {
+    if (!isa<ConstantMatrixType>(Target)) {
+      return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_matrix_scalar);
+    } else if (getLangOpts().HLSL &&
+               Target->castAs<ConstantMatrixType>()->getNumElementsFlattened() <
+                   Source->castAs<ConstantMatrixType>()
+                       ->getNumElementsFlattened()) {
+      // Diagnose Matrix truncation but don't return. We may also want to
+      // diagnose an element conversion.
+      DiagnoseImpCast(*this, E, T, CC,
+                      diag::warn_hlsl_impcast_matrix_truncation);
+    }
+  }
   // Strip complex types.
   if (isa<ComplexType>(Source)) {
     if (!isa<ComplexType>(Target)) {
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
new file mode 100644
index 0000000000000..360c9f7f31b15
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -Wconversion -verify %s
+
+export int3x4 trunc_cast(int4x4 i44) {
+    int3x4 i34 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 3, 4>'}}
+    return i34;
+}
+
+export int4x3 trunc_cast0(int4x4 i44) {
+    int4x3 i43 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 4, 3>'}}
+    return i43;
+}
+
+export int3x3 trunc_cast1(int4x4 i44) {
+    int3x3 i33 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 3, 3>'}}
+    return i33;
+}
+
+export int3x2 trunc_cast2(int4x4 i44) {
+    int3x2 i32 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 3, 2>'}}
+    return i32;
+}
+
+export int2x3 trunc_cast3(int4x4 i44) {
+    int2x3 i23 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 2, 3>'}}
+    return i23;
+}
+
+export int2x2 trunc_cast4(int4x4 i44) {
+    int2x2 i22 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 2, 2>'}}
+    return i22;
+}
+
+export int2x1 trunc_cast5(int4x4 i44) {
+    int2x1 i21 = i44;
+    // expected-warning at -1{{implicit conversion truncates matrix: 'int4x4' (aka 'matrix<int, 4, 4>') to 'matrix<int, 2, 1>'}}
+    return i21;
+}
+
+export int trunc_scalar_cast6(int4x4 i44) {
+    int i1 = i44;
+    // expected-warning at -1{{implicit conversion turns matrix to scalar: 'int4x4' (aka 'matrix<int, 4, 4>') to 'int'}}
+    return i1;
+}
+

>From 023ef0eaa0ddf9e68267f5b595a7ffab8d535eb6 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 21 Nov 2025 16:09:19 -0500
Subject: [PATCH 05/12] create a -Wmatrix-conversion diagnostic group. Move
 some of the vector warnings to the Vector diagnostic group

---
 clang/include/clang/Basic/DiagnosticGroups.td             | 3 +++
 clang/include/clang/Basic/DiagnosticSemaKinds.td          | 8 ++++----
 clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl  | 6 ++++++
 .../BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl    | 2 +-
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 2fff32bbc4d6c..d99a6b8a568c7 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1061,6 +1061,7 @@ def SuperSubClassMismatch : DiagGroup<"super-class-method-mismatch">;
 def OverridingMethodMismatch : DiagGroup<"overriding-method-mismatch">;
 def VariadicMacros : DiagGroup<"variadic-macros">;
 def VectorConversion : DiagGroup<"vector-conversion">;      // clang specific
+def MatrixConversion : DiagGroup<"matrix-conversion">;      // clang specific
 def VexingParse : DiagGroup<"vexing-parse">;
 def VLAUseStaticAssert : DiagGroup<"vla-extension-static-assert">;
 def VLACxxExtension : DiagGroup<"vla-cxx-extension", [VLAUseStaticAssert]>;
@@ -1335,6 +1336,8 @@ def : DiagGroup<"int-conversions",
                 [IntConversion]>; // -Wint-conversions = -Wint-conversion
 def : DiagGroup<"vector-conversions",
                 [VectorConversion]>; // -Wvector-conversions = -Wvector-conversion
+def : DiagGroup<"matrix-conversions",
+                [MatrixConversion]>; // -Wvector-conversions = -Wmatrix-conversion
 def : DiagGroup<"unused-local-typedefs", [UnusedLocalTypedef]>;
                 // -Wunused-local-typedefs = -Wunused-local-typedef
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a107e7eea8b79..97e3418f6d22b 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4358,10 +4358,10 @@ def warn_unknown_sanitizer_ignored : Warning<
 
 def warn_impcast_matrix_scalar : Warning<
   "implicit conversion turns matrix to scalar: %0 to %1">,
-  InGroup<Conversion>, DefaultIgnore;
+  InGroup<MatrixConversion>, DefaultIgnore;
 def warn_impcast_vector_scalar : Warning<
   "implicit conversion turns vector to scalar: %0 to %1">,
-  InGroup<Conversion>, DefaultIgnore;
+  InGroup<VectorConversion>, DefaultIgnore;
 def warn_impcast_complex_scalar : Warning<
   "implicit conversion discards imaginary component: %0 to %1">,
   InGroup<Conversion>, DefaultIgnore;
@@ -13277,10 +13277,10 @@ def err_hlsl_builtin_scalar_vector_mismatch
           "vector type with matching scalar element type%diff{: $ vs $|}2,3">;
 
 def warn_hlsl_impcast_vector_truncation : Warning<
-  "implicit conversion truncates vector: %0 to %1">, InGroup<Conversion>;
+  "implicit conversion truncates vector: %0 to %1">, InGroup<VectorConversion>;
 
 def warn_hlsl_impcast_matrix_truncation : Warning<
-  "implicit conversion truncates matrix: %0 to %1">, InGroup<Conversion>;
+  "implicit conversion truncates matrix: %0 to %1">, InGroup<MatrixConversion>;
 
 def warn_hlsl_availability : Warning<
   "%0 is only available %select{|in %4 environment }3on %1 %2 or newer">,
diff --git a/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
index ebe4db7a9e26e..e92688ac1a920 100644
--- a/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
@@ -236,6 +236,7 @@ void matOrVec2(float2x3 F) {} // expected-note{{candidate function}}
 
 export void Case8(float2x3 f23, float4x4 f44, float3x3 f33, float3x2 f32) {
   int2x2 i22 = f23;
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float2x3' (aka 'matrix<float, 2, 3>') to 'int2x2' (aka 'matrix<int, 2, 2>')}}
   //CHECK: VarDecl {{.*}} i22 'int2x2':'matrix<int, 2, 2>' cinit
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2x2':'matrix<int, 2, 2>' <FloatingToIntegral>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
@@ -245,6 +246,7 @@ export void Case8(float2x3 f23, float4x4 f44, float3x3 f33, float3x2 f32) {
 #endif
   
   fn2x2(f23);
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float2x3' (aka 'matrix<float, 2, 3>') to 'matrix<float, 2, 2>'}}
   //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'fn2x2' 'void (float2x2)'
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
@@ -255,11 +257,13 @@ export void Case8(float2x3 f23, float4x4 f44, float3x3 f33, float3x2 f32) {
 #endif
 
   matOrVec(f23);
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float2x3' (aka 'matrix<float, 2, 3>') to 'matrix<float, 2, 2>'}}
   //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'matOrVec' 'void (float2x2)'
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
 
   matOrVec(f44);
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float4x4' (aka 'matrix<float, 4, 4>') to 'matrix<float, 2, 2>'}}
   //CHECK: DeclRefExpr {{.*}} 'void (float2x2)' lvalue Function {{.*}} 'matOrVec' 'void (float2x2)'
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 2>' <HLSLMatrixTruncation>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4x4':'matrix<float, 4, 4>' <LValueToRValue>
@@ -272,11 +276,13 @@ export void Case8(float2x3 f23, float4x4 f44, float3x3 f33, float3x2 f32) {
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2x3':'matrix<float, 2, 3>' <LValueToRValue>
 
   matOrVec2(f44);
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float4x4' (aka 'matrix<float, 4, 4>') to 'matrix<float, 2, 3>'}}
   //CHECK: DeclRefExpr {{.*}} 'void (float2x3)' lvalue Function {{.*}} 'matOrVec2' 'void (float2x3)'
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 3>' <HLSLMatrixTruncation>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4x4':'matrix<float, 4, 4>' <LValueToRValue>
 
   matOrVec2(f33);
+  // expected-warning at -1{{implicit conversion truncates matrix: 'float3x3' (aka 'matrix<float, 3, 3>') to 'matrix<float, 2, 3>'}}
   //CHECK: DeclRefExpr {{.*}} 'void (float2x3)' lvalue Function {{.*}} 'matOrVec2' 'void (float2x3)'
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'matrix<float, 2, 3>' <HLSLMatrixTruncation>
   //CHECK-NEXT: ImplicitCastExpr {{.*}} 'float3x3':'matrix<float, 3, 3>' <LValueToRValue>
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
index 360c9f7f31b15..7d51a2062b3ae 100644
--- a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -Wconversion -verify %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -Wmatrix-conversion -verify %s
 
 export int3x4 trunc_cast(int4x4 i44) {
     int3x4 i34 = i44;

>From 0259c34bd884498bd7d376ad4bf6fdd6e9b4fc28 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 21 Nov 2025 16:46:26 -0500
Subject: [PATCH 06/12] fix CI failure, revert changes to
 warn_impcast_matrix_scalar. the VectorConversion group is more invasive than
 expected

---
 clang/include/clang/Basic/DiagnosticGroups.td    | 2 +-
 clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +-
 clang/test/Driver/autocomplete.c                 | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index d99a6b8a568c7..063957e7b18ae 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1337,7 +1337,7 @@ def : DiagGroup<"int-conversions",
 def : DiagGroup<"vector-conversions",
                 [VectorConversion]>; // -Wvector-conversions = -Wvector-conversion
 def : DiagGroup<"matrix-conversions",
-                [MatrixConversion]>; // -Wvector-conversions = -Wmatrix-conversion
+                [MatrixConversion]>; // -Wmatrix-conversions = -Wmatrix-conversion
 def : DiagGroup<"unused-local-typedefs", [UnusedLocalTypedef]>;
                 // -Wunused-local-typedefs = -Wunused-local-typedef
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 97e3418f6d22b..7af88461738c4 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4361,7 +4361,7 @@ def warn_impcast_matrix_scalar : Warning<
   InGroup<MatrixConversion>, DefaultIgnore;
 def warn_impcast_vector_scalar : Warning<
   "implicit conversion turns vector to scalar: %0 to %1">,
-  InGroup<VectorConversion>, DefaultIgnore;
+  InGroup<Conversion>, DefaultIgnore;
 def warn_impcast_complex_scalar : Warning<
   "implicit conversion discards imaginary component: %0 to %1">,
   InGroup<Conversion>, DefaultIgnore;
diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index 4983b71496834..1fd60929751ee 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -117,6 +117,8 @@
 // WARNING-NEXT: -Wmany-braces-around-scalar-init
 // WARNING-NEXT: -Wmath-errno-enabled-with-veclib
 // WARNING-NEXT: -Wmathematical-notation-identifier-extension
+// WARNING-NEXT: -Wmatrix-conversion
+// WARNING-NEXT: -Wmatrix-conversions
 // WARNING-NEXT: -Wmax-tokens
 // WARNING-NEXT: -Wmax-unsigned-zero
 // RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING

>From d60c0ce6f61daf76f3a294fca7f52d3f9e81900c Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Tue, 25 Nov 2025 15:08:46 -0500
Subject: [PATCH 07/12] address pr comments

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  2 +-
 clang/lib/CodeGen/CGExprScalar.cpp            | 54 ++++++++++++++-----
 clang/lib/Sema/SemaOverload.cpp               |  2 +-
 .../BasicFeatures/MatrixElementTypeCast.hlsl  | 33 ++++++++++++
 .../MatrixExplicitTruncation.hlsl             | 12 ++---
 .../MatrixImplicitTruncation.hlsl             | 12 ++---
 6 files changed, 87 insertions(+), 28 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 7af88461738c4..17d2a78b96f9f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4358,7 +4358,7 @@ def warn_unknown_sanitizer_ignored : Warning<
 
 def warn_impcast_matrix_scalar : Warning<
   "implicit conversion turns matrix to scalar: %0 to %1">,
-  InGroup<MatrixConversion>, DefaultIgnore;
+  InGroup<MatrixConversion>;
 def warn_impcast_vector_scalar : Warning<
   "implicit conversion turns vector to scalar: %0 to %1">,
   InGroup<Conversion>, DefaultIgnore;
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index a9e2ebdffa59a..94f27cd3da8a0 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2426,17 +2426,26 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction &CGF, LValue SrcVal,
     assert(LoadList.size() >= MatTy->getNumElementsFlattened() &&
            "Flattened type on RHS must have the same number or more elements "
            "than vector on LHS.");
+
     llvm::Value *V =
         CGF.Builder.CreateLoad(CGF.CreateIRTemp(DestTy, "flatcast.tmp"));
     // write to V.
-    for (unsigned I = 0, E = MatTy->getNumElementsFlattened(); I < E; I++) {
-      RValue RVal = CGF.EmitLoadOfLValue(LoadList[I], Loc);
-      assert(RVal.isScalar() &&
-             "All flattened source values should be scalars.");
-      llvm::Value *Cast =
-          CGF.EmitScalarConversion(RVal.getScalarVal(), LoadList[I].getType(),
-                                   MatTy->getElementType(), Loc);
-      V = CGF.Builder.CreateInsertElement(V, Cast, I);
+    unsigned NumCols = MatTy->getNumColumns();
+    unsigned NumRows = MatTy->getNumRows();
+    unsigned ColOffset = NumCols;
+    if (auto *SrcMatTy = SrcVal.getType()->getAs<ConstantMatrixType>())
+      ColOffset = SrcMatTy->getNumColumns();
+    for (unsigned R = 0; R < NumRows; R++) {
+      for (unsigned C = 0; C < NumCols; C++) {
+        unsigned I = R * ColOffset + C;
+        RValue RVal = CGF.EmitLoadOfLValue(LoadList[I], Loc);
+        assert(RVal.isScalar() &&
+               "All flattened source values should be scalars.");
+        llvm::Value *Cast =
+            CGF.EmitScalarConversion(RVal.getScalarVal(), LoadList[I].getType(),
+                                     MatTy->getElementType(), Loc);
+        V = CGF.Builder.CreateInsertElement(V, Cast, I);
+      }
     }
     return V;
   }
@@ -2978,9 +2987,17 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     Value *Mat = Visit(E);
     if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
       SmallVector<int> Mask;
-      unsigned NumElts = MatTy->getNumElementsFlattened();
-      for (unsigned I = 0; I != NumElts; ++I)
-        Mask.push_back(I);
+      unsigned NumCols = MatTy->getNumColumns();
+      unsigned NumRows = MatTy->getNumRows();
+      unsigned ColOffset = NumCols;
+      if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
+        ColOffset = SrcMatTy->getNumColumns();
+      for (unsigned R = 0; R < NumRows; R++) {
+        for (unsigned C = 0; C < NumCols; C++) {
+          unsigned I = R * ColOffset + C;
+          Mask.push_back(I);
+        }
+      }
 
       return Builder.CreateShuffleVector(Mat, Mask, "trunc");
     }
@@ -2991,11 +3008,20 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     RValue RV = CGF.EmitAnyExpr(E);
     SourceLocation Loc = CE->getExprLoc();
 
-    assert(RV.isAggregate() && "Not a valid HLSL Elementwise Cast.");
-    // RHS is an aggregate
-    LValue SrcVal = CGF.MakeAddrLValue(RV.getAggregateAddress(), E->getType());
+    Address SrcAddr = Address::invalid();
+
+    if (RV.isAggregate()) {
+      SrcAddr = RV.getAggregateAddress();
+    } else {
+      SrcAddr = CGF.CreateMemTemp(E->getType(), "hlsl.ewcast.src");
+      LValue TmpLV = CGF.MakeAddrLValue(SrcAddr, E->getType());
+      CGF.EmitStoreThroughLValue(RV, TmpLV);
+    }
+
+    LValue SrcVal = CGF.MakeAddrLValue(SrcAddr, E->getType());
     return EmitHLSLElementwiseCast(CGF, SrcVal, DestTy, Loc);
   }
+
   } // end of switch
 
   llvm_unreachable("unknown scalar cast");
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 1acbda85601af..17c98e9b50aa9 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -2114,7 +2114,7 @@ static bool IsMatrixConversion(Sema &S, QualType FromType, QualType ToType,
                                ImplicitConversionKind &ICK,
                                ImplicitConversionKind &ElConv, Expr *From,
                                bool InOverloadResolution, bool CStyle) {
-  // The non HLSL Matrix conversion rules are not clear.
+  // Implicit conversions for matrices are an HLSL feature not present in C/C++.
   if (!S.getLangOpts().HLSL)
     return false;
 
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
index 081b8013efcbc..eb79e6e46d83a 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
@@ -184,3 +184,36 @@ struct Derived : BFields {
 void call4(Derived D) {
   int2x2 A = (int2x2)D;
 }
+
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z5call5Dv4_f(
+// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[M2:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 4
+// CHECK-NEXT:    store <4 x float> [[M]], ptr [[M_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 16
+// CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[VECEXT]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[VECEXT1]], i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
+// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[VECEXT2]], i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+// CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[VECEXT3]], i64 3
+// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[M2]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[M2]], align 4
+// CHECK-NEXT:    ret <4 x float> [[TMP10]]
+//
+float2x2 call5(float4 v) {
+    float2x2 m = (float2x2)v;
+    return m;
+}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
index f16d01e1d12ea..f3c4bc496d5a4 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
@@ -25,7 +25,7 @@
 // CHECK-NEXT:    [[I43:%.*]] = alloca [12 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 12, i32 13, i32 14>
 // CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I43]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I43]], align 4
 // CHECK-NEXT:    ret <12 x i32> [[TMP1]]
@@ -42,7 +42,7 @@
 // CHECK-NEXT:    [[I33:%.*]] = alloca [9 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10>
 // CHECK-NEXT:    store <9 x i32> [[TRUNC]], ptr [[I33]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[I33]], align 4
 // CHECK-NEXT:    ret <9 x i32> [[TMP1]]
@@ -59,7 +59,7 @@
 // CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9>
 // CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I32]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
 // CHECK-NEXT:    ret <6 x i32> [[TMP1]]
@@ -76,7 +76,7 @@
 // CHECK-NEXT:    [[I23:%.*]] = alloca [6 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6>
 // CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I23]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I23]], align 4
 // CHECK-NEXT:    ret <6 x i32> [[TMP1]]
@@ -93,7 +93,7 @@
 // CHECK-NEXT:    [[I22:%.*]] = alloca [4 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 // CHECK-NEXT:    store <4 x i32> [[TRUNC]], ptr [[I22]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I22]], align 4
 // CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -110,7 +110,7 @@
 // CHECK-NEXT:    [[I21:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 4>
 // CHECK-NEXT:    store <2 x i32> [[TRUNC]], ptr [[I21]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[I21]], align 4
 // CHECK-NEXT:    ret <2 x i32> [[TMP1]]
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
index 6a53d2e8ee96c..e621f68623bd1 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
@@ -25,7 +25,7 @@
 // CHECK-NEXT:    [[I43:%.*]] = alloca [12 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 12, i32 13, i32 14>
 // CHECK-NEXT:    store <12 x i32> [[TRUNC]], ptr [[I43]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <12 x i32>, ptr [[I43]], align 4
 // CHECK-NEXT:    ret <12 x i32> [[TMP1]]
@@ -42,7 +42,7 @@
 // CHECK-NEXT:    [[I33:%.*]] = alloca [9 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10>
 // CHECK-NEXT:    store <9 x i32> [[TRUNC]], ptr [[I33]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[I33]], align 4
 // CHECK-NEXT:    ret <9 x i32> [[TMP1]]
@@ -59,7 +59,7 @@
 // CHECK-NEXT:    [[I32:%.*]] = alloca [6 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9>
 // CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I32]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I32]], align 4
 // CHECK-NEXT:    ret <6 x i32> [[TMP1]]
@@ -76,7 +76,7 @@
 // CHECK-NEXT:    [[I23:%.*]] = alloca [6 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6>
 // CHECK-NEXT:    store <6 x i32> [[TRUNC]], ptr [[I23]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <6 x i32>, ptr [[I23]], align 4
 // CHECK-NEXT:    ret <6 x i32> [[TMP1]]
@@ -93,7 +93,7 @@
 // CHECK-NEXT:    [[I22:%.*]] = alloca [4 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 // CHECK-NEXT:    store <4 x i32> [[TRUNC]], ptr [[I22]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I22]], align 4
 // CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -110,7 +110,7 @@
 // CHECK-NEXT:    [[I21:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT:    store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
-// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 4>
 // CHECK-NEXT:    store <2 x i32> [[TRUNC]], ptr [[I21]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[I21]], align 4
 // CHECK-NEXT:    ret <2 x i32> [[TMP1]]

>From 3a48516c7d051a266efcd36ac4b11f7950ecc885 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Tue, 25 Nov 2025 20:55:22 -0500
Subject: [PATCH 08/12] fix element cast indexing to pass offload test

---
 clang/lib/CodeGen/CGExprScalar.cpp | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 94f27cd3da8a0..01123b78abba2 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2430,22 +2430,17 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction &CGF, LValue SrcVal,
     llvm::Value *V =
         CGF.Builder.CreateLoad(CGF.CreateIRTemp(DestTy, "flatcast.tmp"));
     // write to V.
-    unsigned NumCols = MatTy->getNumColumns();
-    unsigned NumRows = MatTy->getNumRows();
-    unsigned ColOffset = NumCols;
-    if (auto *SrcMatTy = SrcVal.getType()->getAs<ConstantMatrixType>())
-      ColOffset = SrcMatTy->getNumColumns();
-    for (unsigned R = 0; R < NumRows; R++) {
-      for (unsigned C = 0; C < NumCols; C++) {
-        unsigned I = R * ColOffset + C;
-        RValue RVal = CGF.EmitLoadOfLValue(LoadList[I], Loc);
-        assert(RVal.isScalar() &&
-               "All flattened source values should be scalars.");
-        llvm::Value *Cast =
-            CGF.EmitScalarConversion(RVal.getScalarVal(), LoadList[I].getType(),
-                                     MatTy->getElementType(), Loc);
-        V = CGF.Builder.CreateInsertElement(V, Cast, I);
-      }
+    for (unsigned I = 0, E = MatTy->getNumElementsFlattened(); I < E; I++) {
+      unsigned ColMajorIndex =
+          (I % MatTy->getNumRows()) * MatTy->getNumColumns() +
+          (I / MatTy->getNumRows());
+      RValue RVal = CGF.EmitLoadOfLValue(LoadList[ColMajorIndex], Loc);
+      assert(RVal.isScalar() &&
+             "All flattened source values should be scalars.");
+      llvm::Value *Cast = CGF.EmitScalarConversion(
+          RVal.getScalarVal(), LoadList[ColMajorIndex].getType(),
+          MatTy->getElementType(), Loc);
+      V = CGF.Builder.CreateInsertElement(V, Cast, I);
     }
     return V;
   }

>From 4fb00914b90736771b011e3764766252e953889c Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Wed, 26 Nov 2025 00:28:33 -0500
Subject: [PATCH 09/12] fix test failure

---
 .../BasicFeatures/MatrixElementTypeCast.hlsl  | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
index eb79e6e46d83a..3bd7636212862 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
@@ -168,14 +168,14 @@ struct Derived : BFields {
 // CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[GEP1]], align 8
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[TMP1]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[GEP2]], align 4
+// CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CONV4]], i64 1
 // CHECK-NEXT:    [[BF_LOAD:%.*]] = load i24, ptr [[E]], align 1
 // CHECK-NEXT:    [[BF_SHL:%.*]] = shl i24 [[BF_LOAD]], 9
 // CHECK-NEXT:    [[BF_ASHR:%.*]] = ashr i24 [[BF_SHL]], 9
 // CHECK-NEXT:    [[BF_CAST:%.*]] = sext i24 [[BF_ASHR]] to i32
-// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[BF_CAST]], i64 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4
-// CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP4]] to i32
-// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CONV4]], i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BF_CAST]], i64 2
 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP3]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 3
 // CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[A]], align 4
@@ -186,14 +186,14 @@ void call4(Derived D) {
 }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z5call5Dv4_f(
-// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[M2:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[M:%.*]] = alloca [4 x float], align 4
 // CHECK-NEXT:    [[HLSL_EWCAST_SRC:%.*]] = alloca <4 x float>, align 16
 // CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 4
-// CHECK-NEXT:    store <4 x float> [[M]], ptr [[M_ADDR]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[V]], ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V_ADDR]], align 16
 // CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 16
 // CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 4
@@ -201,16 +201,16 @@ void call4(Derived D) {
 // CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 // CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[VECEXT]], i64 0
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
-// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
 // CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[VECEXT1]], i64 1
 // CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
-// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
 // CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[VECEXT2]], i64 2
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
 // CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
 // CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[VECEXT3]], i64 3
-// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[M2]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[M2]], align 4
+// CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[M]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[M]], align 4
 // CHECK-NEXT:    ret <4 x float> [[TMP10]]
 //
 float2x2 call5(float4 v) {

>From e6b4ab6c66c65c54d5ec10cd4bb7494d35093dd2 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Tue, 2 Dec 2025 12:15:09 -0500
Subject: [PATCH 10/12] address pr comments

---
 clang/lib/CodeGen/CGExprScalar.cpp            | 32 ++++++++-----------
 clang/lib/Sema/SemaChecking.cpp               | 17 +++++-----
 .../MatrixElementOverloadResolution.hlsl      |  4 +--
 .../MatrixImplicitTruncCastWarnings.hlsl      |  2 +-
 4 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 01123b78abba2..0fc11f641d7f2 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2429,7 +2429,7 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction &CGF, LValue SrcVal,
 
     llvm::Value *V =
         CGF.Builder.CreateLoad(CGF.CreateIRTemp(DestTy, "flatcast.tmp"));
-    // write to V.
+    // V is an allocated temporary to build the truncated matrix into.
     for (unsigned I = 0, E = MatTy->getNumElementsFlattened(); I < E; I++) {
       unsigned ColMajorIndex =
           (I % MatTy->getNumRows()) * MatTy->getNumColumns() +
@@ -2977,27 +2977,23 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc");
   }
   case CK_HLSLMatrixTruncation: {
-    assert((DestTy->isMatrixType() || DestTy->isBuiltinType()) &&
+    assert((DestTy->isConstantMatrixType() || DestTy->isBuiltinType()) &&
            "Destination type must be a matrix or builtin type.");
     Value *Mat = Visit(E);
-    if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
-      SmallVector<int> Mask;
-      unsigned NumCols = MatTy->getNumColumns();
-      unsigned NumRows = MatTy->getNumRows();
-      unsigned ColOffset = NumCols;
-      if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
-        ColOffset = SrcMatTy->getNumColumns();
-      for (unsigned R = 0; R < NumRows; R++) {
-        for (unsigned C = 0; C < NumCols; C++) {
-          unsigned I = R * ColOffset + C;
-          Mask.push_back(I);
-        }
+    auto *MatTy = DestTy->getAs<ConstantMatrixType>();
+    SmallVector<int> Mask;
+    unsigned NumCols = MatTy->getNumColumns();
+    unsigned NumRows = MatTy->getNumRows();
+    unsigned ColOffset = NumCols;
+    if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
+      ColOffset = SrcMatTy->getNumColumns();
+    for (unsigned R = 0; R < NumRows; R++) {
+      for (unsigned C = 0; C < NumCols; C++) {
+        unsigned I = R * ColOffset + C;
+        Mask.push_back(I);
       }
-
-      return Builder.CreateShuffleVector(Mat, Mask, "trunc");
     }
-    llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
-    return Builder.CreateExtractElement(Mat, Zero, "cast.mtrunc");
+    return Builder.CreateShuffleVector(Mat, Mask, "trunc");
   }
   case CK_HLSLElementwiseCast: {
     RValue RV = CGF.EmitAnyExpr(E);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index cbb909ebbe9cf..72202f1f67ab4 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -12571,9 +12571,10 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
       if (SourceMgr.isInSystemMacro(CC))
         return;
       return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_vector_scalar);
-    } else if (getLangOpts().HLSL &&
-               Target->castAs<VectorType>()->getNumElements() <
-                   Source->castAs<VectorType>()->getNumElements()) {
+    }
+    if (getLangOpts().HLSL &&
+        Target->castAs<VectorType>()->getNumElements() <
+            Source->castAs<VectorType>()->getNumElements()) {
       // Diagnose vector truncation but don't return. We may also want to
       // diagnose an element conversion.
       DiagnoseImpCast(*this, E, T, CC,
@@ -12593,12 +12594,12 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
     Target = VecTy->getElementType().getTypePtr();
 
   if (isa<ConstantMatrixType>(Source)) {
-    if (!isa<ConstantMatrixType>(Target)) {
+    if (Target->isScalarType())
       return DiagnoseImpCast(*this, E, T, CC, diag::warn_impcast_matrix_scalar);
-    } else if (getLangOpts().HLSL &&
-               Target->castAs<ConstantMatrixType>()->getNumElementsFlattened() <
-                   Source->castAs<ConstantMatrixType>()
-                       ->getNumElementsFlattened()) {
+
+    if (getLangOpts().HLSL &&
+        Target->castAs<ConstantMatrixType>()->getNumElementsFlattened() <
+            Source->castAs<ConstantMatrixType>()->getNumElementsFlattened()) {
       // Diagnose Matrix truncation but don't return. We may also want to
       // diagnose an element conversion.
       DiagnoseImpCast(*this, E, T, CC,
diff --git a/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
index e92688ac1a920..04149e176edbd 100644
--- a/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/MatrixElementOverloadResolution.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wconversion -verify -o - %s -DERROR=1
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wno-conversion -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -verify -o - %s -DERROR=1
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -ast-dump %s | FileCheck %s
 
 // This test verifies floating point type implicit conversion ranks for overload
 // resolution. In HLSL the built-in type ranks are half < float < double. This
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
index 7d51a2062b3ae..2c50b957578ec 100644
--- a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixImplicitTruncCastWarnings.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -Wmatrix-conversion -verify %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s
 
 export int3x4 trunc_cast(int4x4 i44) {
     int3x4 i34 = i44;

>From fef4b65f095963bfc9ae933675e039981adce8f6 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Tue, 2 Dec 2025 12:42:09 -0500
Subject: [PATCH 11/12] fix to support BuiltinType

---
 clang/lib/CodeGen/CGExprScalar.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 0fc11f641d7f2..f95a494732ef9 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2980,10 +2980,13 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     assert((DestTy->isConstantMatrixType() || DestTy->isBuiltinType()) &&
            "Destination type must be a matrix or builtin type.");
     Value *Mat = Visit(E);
-    auto *MatTy = DestTy->getAs<ConstantMatrixType>();
+    unsigned NumCols = 1;
+    unsigned NumRows = 1;
     SmallVector<int> Mask;
-    unsigned NumCols = MatTy->getNumColumns();
-    unsigned NumRows = MatTy->getNumRows();
+    if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
+      NumCols = MatTy->getNumColumns();
+      NumRows = MatTy->getNumRows();
+    }
     unsigned ColOffset = NumCols;
     if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
       ColOffset = SrcMatTy->getNumColumns();

>From 71a3ece410f75e201e94508b4a12b34b61afda9a Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Tue, 2 Dec 2025 13:13:18 -0500
Subject: [PATCH 12/12] rever changes to CGExprScalar.cpp as the suggestion was
 a missunderstanding. fix up some typos

---
 clang/lib/CodeGen/CGExprScalar.cpp | 31 +++++++++++++++---------------
 clang/lib/Sema/SemaChecking.cpp    |  2 +-
 clang/lib/Sema/SemaOverload.cpp    |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index f95a494732ef9..769bc37b0e131 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2977,26 +2977,27 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc");
   }
   case CK_HLSLMatrixTruncation: {
-    assert((DestTy->isConstantMatrixType() || DestTy->isBuiltinType()) &&
+    assert((DestTy->isMatrixType() || DestTy->isBuiltinType()) &&
            "Destination type must be a matrix or builtin type.");
     Value *Mat = Visit(E);
-    unsigned NumCols = 1;
-    unsigned NumRows = 1;
-    SmallVector<int> Mask;
     if (auto *MatTy = DestTy->getAs<ConstantMatrixType>()) {
-      NumCols = MatTy->getNumColumns();
-      NumRows = MatTy->getNumRows();
-    }
-    unsigned ColOffset = NumCols;
-    if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
-      ColOffset = SrcMatTy->getNumColumns();
-    for (unsigned R = 0; R < NumRows; R++) {
-      for (unsigned C = 0; C < NumCols; C++) {
-        unsigned I = R * ColOffset + C;
-        Mask.push_back(I);
+      SmallVector<int> Mask;
+      unsigned NumCols = MatTy->getNumColumns();
+      unsigned NumRows = MatTy->getNumRows();
+      unsigned ColOffset = NumCols;
+      if (auto *SrcMatTy = E->getType()->getAs<ConstantMatrixType>())
+        ColOffset = SrcMatTy->getNumColumns();
+      for (unsigned R = 0; R < NumRows; R++) {
+        for (unsigned C = 0; C < NumCols; C++) {
+          unsigned I = R * ColOffset + C;
+          Mask.push_back(I);
+        }
       }
+
+      return Builder.CreateShuffleVector(Mat, Mask, "trunc");
     }
-    return Builder.CreateShuffleVector(Mat, Mask, "trunc");
+    llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
+    return Builder.CreateExtractElement(Mat, Zero, "cast.mtrunc");
   }
   case CK_HLSLElementwiseCast: {
     RValue RV = CGF.EmitAnyExpr(E);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 72202f1f67ab4..58de9fe48162b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -12590,7 +12590,7 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
     Source = cast<VectorType>(Source)->getElementType().getTypePtr();
     Target = cast<VectorType>(Target)->getElementType().getTypePtr();
   }
-  if (auto VecTy = dyn_cast<VectorType>(Target))
+  if (const auto *VecTy = dyn_cast<VectorType>(Target))
     Target = VecTy->getElementType().getTypePtr();
 
   if (isa<ConstantMatrixType>(Source)) {
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 17c98e9b50aa9..9a3a78164f0f8 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -2121,7 +2121,7 @@ static bool IsMatrixConversion(Sema &S, QualType FromType, QualType ToType,
   auto *ToMatrixType = ToType->getAs<ConstantMatrixType>();
   auto *FromMatrixType = FromType->getAs<ConstantMatrixType>();
 
-  // If both arguments are vectors, handle possible vector truncation and
+  // If both arguments are matrix, handle possible matrix truncation and
   // element conversion.
   if (ToMatrixType && FromMatrixType) {
     unsigned FromCols = FromMatrixType->getNumColumns();