[clang] [HLSL] Implement HLSL Flat casting (excluding splat cases) (PR #118842)
Sarah Spall via cfe-commits
cfe-commits at lists.llvm.org
Thu Feb 6 08:26:00 PST 2025
https://github.com/spall updated https://github.com/llvm/llvm-project/pull/118842
>From 2e932a57ccb992b856b58bec4c30c6b64f24f711 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Thu, 28 Nov 2024 16:23:57 +0000
Subject: [PATCH 01/18] Flat casts WIP
---
clang/include/clang/AST/OperationKinds.def | 3 +
clang/include/clang/Sema/SemaHLSL.h | 2 +
clang/lib/AST/Expr.cpp | 1 +
clang/lib/AST/ExprConstant.cpp | 1 +
clang/lib/CodeGen/CGExpr.cpp | 84 ++++++++++
clang/lib/CodeGen/CGExprAgg.cpp | 83 +++++++++-
clang/lib/CodeGen/CGExprComplex.cpp | 1 +
clang/lib/CodeGen/CGExprConstant.cpp | 1 +
clang/lib/CodeGen/CGExprScalar.cpp | 39 +++++
clang/lib/CodeGen/CodeGenFunction.h | 7 +
clang/lib/Edit/RewriteObjCFoundationAPI.cpp | 1 +
clang/lib/Sema/Sema.cpp | 1 +
clang/lib/Sema/SemaCast.cpp | 20 ++-
clang/lib/Sema/SemaHLSL.cpp | 143 ++++++++++++++++++
clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp | 1 +
15 files changed, 384 insertions(+), 4 deletions(-)
diff --git a/clang/include/clang/AST/OperationKinds.def b/clang/include/clang/AST/OperationKinds.def
index 8788b8ff0ef0a45..9323d4e861a7342 100644
--- a/clang/include/clang/AST/OperationKinds.def
+++ b/clang/include/clang/AST/OperationKinds.def
@@ -367,6 +367,9 @@ CAST_OPERATION(HLSLVectorTruncation)
// Non-decaying array RValue cast (HLSL only).
CAST_OPERATION(HLSLArrayRValue)
+// Aggregate by Value cast (HLSL only).
+CAST_OPERATION(HLSLAggregateCast)
+
//===- Binary Operations -------------------------------------------------===//
// Operators listed in order of precedence.
// Note that additions to this should also update the StmtVisitor class,
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index ee685d95c961542..6bda1e8ce0ea5be 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -140,6 +140,8 @@ class SemaHLSL : public SemaBase {
// Diagnose whether the input ID is uint/unit2/uint3 type.
bool diagnoseInputIDType(QualType T, const ParsedAttr &AL);
+ bool CanPerformScalarCast(QualType SrcTy, QualType DestTy);
+ bool CanPerformAggregateCast(Expr *Src, QualType DestType);
ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg);
QualType getInoutParameterType(QualType Ty);
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index a4fb4d5a1f2ec41..4764bc84ce498a6 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1942,6 +1942,7 @@ bool CastExpr::CastConsistency() const {
case CK_FixedPointToBoolean:
case CK_HLSLArrayRValue:
case CK_HLSLVectorTruncation:
+ case CK_HLSLAggregateCast:
CheckNoBasePath:
assert(path_empty() && "Cast kind should not have a base path!");
break;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 6b5b95aee35522d..b548cef41b75257 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -15733,6 +15733,7 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_IntegralToFixedPoint:
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
+ case CK_HLSLAggregateCast:
llvm_unreachable("invalid cast kind for complex value");
case CK_LValueToRValue:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5fccc9cbb37ec11..b7608b1226758d7 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5320,6 +5320,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) {
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
+ case CK_HLSLAggregateCast:
return EmitUnsupportedLValue(E, "unexpected cast lvalue");
case CK_Dependent:
@@ -6358,3 +6359,86 @@ RValue CodeGenFunction::EmitPseudoObjectRValue(const PseudoObjectExpr *E,
LValue CodeGenFunction::EmitPseudoObjectLValue(const PseudoObjectExpr *E) {
return emitPseudoObjectExpr(*this, E, true, AggValueSlot::ignored()).LV;
}
+
+llvm::Value* CodeGenFunction::PerformLoad(std::pair<Address, llvm::Value *> &GEP) {
+ Address GEPAddress = GEP.first;
+ llvm::Value *Idx = GEP.second;
+ llvm::Value *V = Builder.CreateLoad(GEPAddress, "load");
+ if (Idx) { // loading from a vector so perform an extract as well
+ return Builder.CreateExtractElement(V, Idx, "vec.load");
+ }
+ return V;
+}
+
+llvm::Value* CodeGenFunction::PerformStore(std::pair<Address, llvm::Value *> &GEP,
+ llvm::Value *Val) {
+ Address GEPAddress = GEP.first;
+ llvm::Value *Idx = GEP.second;
+ if (Idx) {
+ llvm::Value *V = Builder.CreateLoad(GEPAddress, "load.for.insert");
+ return Builder.CreateInsertElement(V, Val, Idx);
+ } else {
+ return Builder.CreateStore(Val, GEPAddress);
+ }
+}
+
+void CodeGenFunction::FlattenAccessAndType(Address Val, QualType SrcTy,
+ SmallVector<llvm::Value *, 4> &IdxList,
+ SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
+ SmallVector<QualType> &FlatTypes) {
+ llvm::IntegerType *IdxTy = llvm::IntegerType::get(getLLVMContext(),32);
+ if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(SrcTy)) {
+ uint64_t Size = CAT->getZExtSize();
+ for(unsigned i = 0; i < Size; i ++) {
+ // flatten each member of the array
+ // add index of this element to index list
+ llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
+ IdxList.push_back(Idx);
+ // recur on this object
+ FlattenAccessAndType(Val, CAT->getElementType(), IdxList, GEPList, FlatTypes);
+ // remove index of this element from index list
+ IdxList.pop_back();
+ }
+ } else if (const RecordType *RT = SrcTy->getAs<RecordType>()) {
+ RecordDecl *Record = RT->getDecl();
+ const CGRecordLayout &RL = getTypes().getCGRecordLayout(Record);
+    // TODO: Do we need to check whether this is a CXXRecordDecl?
+
+ for (auto fieldIter = Record->field_begin(), fieldEnd = Record->field_end();
+ fieldIter != fieldEnd; ++fieldIter) {
+ // get the field number
+ unsigned FieldNum = RL.getLLVMFieldNo(*fieldIter);
+ // can we just do *fieldIter->getFieldIndex();
+ // add that index to the index list
+ llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, FieldNum);
+ IdxList.push_back(Idx);
+ // recur on the field
+ FlattenAccessAndType(Val, fieldIter->getType(), IdxList, GEPList,
+ FlatTypes);
+ // remove index of this element from index list
+ IdxList.pop_back();
+ }
+ } else if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
+ llvm::Type *VTy = ConvertTypeForMem(SrcTy);
+ CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
+ Address GEP = Builder.CreateInBoundsGEP(Val, IdxList,
+ VTy, Align, "vector.gep");
+ for(unsigned i = 0; i < VT->getNumElements(); i ++) {
+ // add index to the list
+ llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
+ // create gep. no need to recur since its always a scalar
+ // gep on vector is not recommended so combine gep with extract/insert
+ GEPList.push_back({GEP, Idx});
+ FlatTypes.push_back(VT->getElementType());
+ }
+  } else { // Base case: SrcTy should be a scalar; TODO: assert rather than assume?
+ // create a gep
+ llvm::Type *Ty = ConvertTypeForMem(SrcTy);
+ CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
+ Address GEP = Builder.CreateInBoundsGEP(Val, IdxList,
+ Ty, Align, "gep");
+ GEPList.push_back({GEP, NULL});
+ FlatTypes.push_back(SrcTy);
+ }
+ // target extension types?
+}
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 2ad6587089f1014..bc8e1f0f9248efc 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -491,6 +491,70 @@ static bool isTrivialFiller(Expr *E) {
return false;
}
+
+
+// emit a flat cast where the RHS is a scalar, including vector
+static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
+ QualType DestTy, llvm::Value *SrcVal,
+ QualType SrcTy, SourceLocation Loc) {
+ // Flatten our destination
+ SmallVector<QualType> DestTypes; // Flattened type
+ SmallVector<llvm::Value *, 4> IdxList;
+ SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
+ // ^^ Flattened accesses to DestVal we want to store into
+ CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList,
+ DestTypes);
+
+ if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
+ SrcTy = VT->getElementType();
+ assert(StoreGEPList.size() <= VT->getNumElements() &&
+ "Cannot perform HLSL flat cast when vector source \
+ object has less elements than flattened destination \
+ object.");
+ for(unsigned i = 0; i < StoreGEPList.size(); i ++) {
+ llvm::Value *Load = CGF.Builder.CreateExtractElement(SrcVal, i,
+ "vec.load");
+ llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTy,
+ DestTypes[i],
+ Loc);
+ CGF.PerformStore(StoreGEPList[i], Cast);
+ }
+ return;
+ }
+ llvm_unreachable("HLSL Flat cast doesn't handle splatting.");
+}
+
+// emit a flat cast where the RHS is an aggregate
+static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
+ QualType DestTy, Address SrcVal,
+ QualType SrcTy, SourceLocation Loc) {
+ // Flatten our destination
+ SmallVector<QualType> DestTypes; // Flattened type
+ SmallVector<llvm::Value *, 4> IdxList;
+ SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
+ // ^^ Flattened accesses to DestVal we want to store into
+ CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList,
+ DestTypes);
+ // Flatten our src
+ SmallVector<QualType> SrcTypes; // Flattened type
+ SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
+ // ^^ Flattened accesses to SrcVal we want to load from
+ IdxList.clear();
+ CGF.FlattenAccessAndType(SrcVal, SrcTy, IdxList, LoadGEPList, SrcTypes);
+
+ assert(StoreGEPList.size() <= LoadGEPList.size() &&
+ "Cannot perform HLSL flat cast when flattened source object \
+ has less elements than flattened destination object.");
+ // apply casts to what we load from LoadGEPList
+ // and store result in Dest
+ for(unsigned i = 0; i < StoreGEPList.size(); i ++) {
+ llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
+ llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTypes[i],
+ DestTypes[i], Loc);
+ CGF.PerformStore(StoreGEPList[i], Cast);
+ }
+}
+
/// Emit initialization of an array from an initializer list. ExprToVisit must
/// be either an InitListEpxr a CXXParenInitListExpr.
void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
@@ -890,7 +954,24 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
case CK_HLSLArrayRValue:
Visit(E->getSubExpr());
break;
-
+ case CK_HLSLAggregateCast: {
+ Expr *Src = E->getSubExpr();
+ QualType SrcTy = Src->getType();
+ RValue RV = CGF.EmitAnyExpr(Src);
+ QualType DestTy = E->getType();
+ Address DestVal = Dest.getAddress();
+ SourceLocation Loc = E->getExprLoc();
+
+ if (RV.isScalar()) {
+ llvm::Value *SrcVal = RV.getScalarVal();
+ EmitHLSLScalarFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
+ } else { // RHS is an aggregate
+ assert(RV.isAggregate() &&
+ "Can't perform HLSL Aggregate cast on a complex type.");
+ Address SrcVal = RV.getAggregateAddress();
+ EmitHLSLAggregateFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
+ }
+ break; }
case CK_NoOp:
case CK_UserDefinedConversion:
case CK_ConstructorConversion:
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index ac31dff11b585ec..05680d36aa2bd77 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -610,6 +610,7 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
+ case CK_HLSLAggregateCast:
llvm_unreachable("invalid cast kind for complex value");
case CK_FloatingRealToComplex:
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 655fc3dc954c819..6d15bc9058e4501 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -1335,6 +1335,7 @@ class ConstExprEmitter
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
+ case CK_HLSLAggregateCast:
return nullptr;
}
llvm_unreachable("Invalid CastKind");
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 4ae8a2b22b1bba8..d7bb702ec3ca20c 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2262,6 +2262,35 @@ bool CodeGenFunction::ShouldNullCheckClassCastValue(const CastExpr *CE) {
return true;
}
+// RHS is an aggregate type
+static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
+ QualType RHSTy, QualType LHSTy,
+ SourceLocation Loc) {
+ SmallVector<llvm::Value *, 4> IdxList;
+ SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
+ SmallVector<QualType> SrcTypes; // Flattened type
+ CGF.FlattenAccessAndType(RHSVal, RHSTy, IdxList, LoadGEPList, SrcTypes);
+ // LHS is either a vector or a builtin?
+ // if its a vector create a temp alloca to store into and return that
+ if (auto *VecTy = LHSTy->getAs<VectorType>()) {
+ llvm::Value *V = CGF.Builder.CreateLoad(CGF.CreateIRTemp(LHSTy, "flatcast.tmp"));
+ // write to V.
+ for(unsigned i = 0; i < VecTy->getNumElements(); i ++) {
+ llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
+ llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTypes[i],
+ VecTy->getElementType(), Loc);
+ V = CGF.Builder.CreateInsertElement(V, Cast, i);
+ }
+ return V;
+ }
+  // If it's a builtin, just do an extract element or load.
+ assert(LHSTy->isBuiltinType() &&
+ "Destination type must be a vector or builtin type.");
+ // TODO add asserts about things being long enough
+ return CGF.EmitScalarConversion(CGF.PerformLoad(LoadGEPList[0]),
+ LHSTy, SrcTypes[0], Loc);
+}
+
// VisitCastExpr - Emit code for an explicit or implicit cast. Implicit casts
// have to handle a more broad range of conversions than explicit casts, as they
// handle things like function to ptr-to-function decay etc.
@@ -2752,7 +2781,17 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc");
}
+ case CK_HLSLAggregateCast: {
+ RValue RV = CGF.EmitAnyExpr(E);
+ SourceLocation Loc = CE->getExprLoc();
+ QualType SrcTy = E->getType();
+ if (RV.isAggregate()) { // RHS is an aggregate
+ Address SrcVal = RV.getAggregateAddress();
+ return EmitHLSLAggregateFlatCast(CGF, SrcVal, SrcTy, DestTy, Loc);
+ }
+ llvm_unreachable("Not a valid HLSL Flat Cast.");
+ }
} // end of switch
llvm_unreachable("unknown scalar cast");
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index eaea0d8a08ac06f..b17ead377610e67 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4359,6 +4359,13 @@ class CodeGenFunction : public CodeGenTypeCache {
AggValueSlot slot = AggValueSlot::ignored());
LValue EmitPseudoObjectLValue(const PseudoObjectExpr *e);
+ llvm::Value *PerformLoad(std::pair<Address, llvm::Value *> &GEP);
+ llvm::Value *PerformStore(std::pair<Address, llvm::Value *> &GEP, llvm::Value *Val);
+ void FlattenAccessAndType(Address Val, QualType SrcTy,
+ SmallVector<llvm::Value *, 4> &IdxList,
+ SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
+ SmallVector<QualType> &FlatTypes);
+
llvm::Value *EmitIvarOffset(const ObjCInterfaceDecl *Interface,
const ObjCIvarDecl *Ivar);
llvm::Value *EmitIvarOffsetAsPointerDiff(const ObjCInterfaceDecl *Interface,
diff --git a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
index 81797c8c4dc75a2..63308319a78d1cc 100644
--- a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
+++ b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
@@ -1085,6 +1085,7 @@ static bool rewriteToNumericBoxedExpression(const ObjCMessageExpr *Msg,
llvm_unreachable("OpenCL-specific cast in Objective-C?");
case CK_HLSLVectorTruncation:
+ case CK_HLSLAggregateCast:
llvm_unreachable("HLSL-specific cast in Objective-C?");
break;
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index d6517511d7db4d2..2f0528d6ab5ce14 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -707,6 +707,7 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty,
case CK_ToVoid:
case CK_NonAtomicToAtomic:
case CK_HLSLArrayRValue:
+ case CK_HLSLAggregateCast:
break;
}
}
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index f98857f852b5af3..955c44cf4a6a424 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -25,6 +25,7 @@
#include "clang/Sema/Initialization.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/SemaRISCV.h"
+#include "clang/Sema/SemaHLSL.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include <set>
@@ -2768,6 +2769,22 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
return;
}
+ CheckedConversionKind CCK = FunctionalStyle
+ ? CheckedConversionKind::FunctionalCast
+ : CheckedConversionKind::CStyleCast;
+  // TODO: Determine what else we should be doing an lvalue-to-rvalue cast
+  // for, and why the record cases below don't perform it.
+ // This case should not trigger on regular vector splat
+ // Or vector cast or vector truncation.
+ QualType SrcTy = SrcExpr.get()->getType();
+ if (Self.getLangOpts().HLSL &&
+ Self.HLSL().CanPerformAggregateCast(SrcExpr.get(), DestType)) {
+ if (SrcTy->isConstantArrayType())
+ SrcExpr = Self.ImpCastExprToType(SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
+ Kind = CK_HLSLAggregateCast;
+ return;
+ }
+
if (ValueKind == VK_PRValue && !DestType->isRecordType() &&
!isPlaceholder(BuiltinType::Overload)) {
SrcExpr = Self.DefaultFunctionArrayLvalueConversion(SrcExpr.get());
@@ -2820,9 +2837,6 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
if (isValidCast(tcr))
Kind = CK_NoOp;
- CheckedConversionKind CCK = FunctionalStyle
- ? CheckedConversionKind::FunctionalCast
- : CheckedConversionKind::CStyleCast;
if (tcr == TC_NotApplicable) {
tcr = TryAddressSpaceCast(Self, SrcExpr, DestType, /*CStyle*/ true, msg,
Kind);
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 88db3e125411935..942c0a8fcaab09f 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2412,6 +2412,149 @@ bool SemaHLSL::CheckCompatibleParameterABI(FunctionDecl *New,
return HadError;
}
+// Follows PerformScalarCast
+bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
+
+ if (SemaRef.getASTContext().hasSameUnqualifiedType(SrcTy, DestTy))
+ return true;
+
+ switch (Type::ScalarTypeKind SrcKind = SrcTy->getScalarTypeKind()) {
+ case Type::STK_MemberPointer:
+ return false;
+
+ case Type::STK_CPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_ObjCObjectPointer:
+ switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_CPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_ObjCObjectPointer:
+ case Type::STK_Bool:
+ case Type::STK_Integral:
+ return true;
+ case Type::STK_Floating:
+ case Type::STK_FloatingComplex:
+ case Type::STK_IntegralComplex:
+ case Type::STK_MemberPointer:
+ return false;
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+ }
+ llvm_unreachable("Should have returned before this");
+
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+
+ case Type::STK_Bool: // casting from bool is like casting from an integer
+ case Type::STK_Integral:
+ switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_CPointer:
+ case Type::STK_ObjCObjectPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_Bool:
+ case Type::STK_Integral:
+ case Type::STK_Floating:
+ case Type::STK_IntegralComplex:
+ case Type::STK_FloatingComplex:
+ return true;
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+ case Type::STK_MemberPointer:
+ return false;
+ }
+ llvm_unreachable("Should have returned before this");
+
+ case Type::STK_Floating:
+ switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_Floating:
+ case Type::STK_Bool:
+ case Type::STK_Integral:
+ case Type::STK_FloatingComplex:
+ case Type::STK_IntegralComplex:
+ return true;
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+ case Type::STK_CPointer:
+ case Type::STK_ObjCObjectPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_MemberPointer:
+ return false;
+ }
+ llvm_unreachable("Should have returned before this");
+
+ case Type::STK_FloatingComplex:
+ switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_FloatingComplex:
+ case Type::STK_IntegralComplex:
+ case Type::STK_Floating:
+ case Type::STK_Bool:
+ case Type::STK_Integral:
+ return true;
+ case Type::STK_CPointer:
+ case Type::STK_ObjCObjectPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_MemberPointer:
+ return false;
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+ }
+ llvm_unreachable("Should have returned before this");
+
+ case Type::STK_IntegralComplex:
+ switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_FloatingComplex:
+ case Type::STK_IntegralComplex:
+ case Type::STK_Integral:
+ case Type::STK_Bool:
+ case Type::STK_Floating:
+ return true;
+ case Type::STK_CPointer:
+ case Type::STK_ObjCObjectPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_MemberPointer:
+ return false;
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't have fixed point types.");
+ }
+ llvm_unreachable("Should have returned before this");
+ }
+
+ llvm_unreachable("Unhandled scalar cast");
+}
+
+// Can we perform an HLSL Flattened cast?
+bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
+
+ // Don't handle casts where LHS and RHS are any combination of scalar/vector
+ // There must be an aggregate somewhere
+ QualType SrcTy = Src->getType();
+ if (SrcTy->isScalarType()) // always a splat and this cast doesn't handle that
+ return false;
+
+ if ((DestTy->isScalarType() || DestTy->isVectorType()) &&
+ (SrcTy->isScalarType() || SrcTy->isVectorType()))
+ return false;
+
+ llvm::SmallVector<QualType> DestTypes;
+ BuildFlattenedTypeList(DestTy, DestTypes);
+ llvm::SmallVector<QualType> SrcTypes;
+ BuildFlattenedTypeList(SrcTy, SrcTypes);
+
+  // SrcTypes must have at least as many elements as DestTypes.
+ if (SrcTypes.size() >= DestTypes.size()) {
+
+ unsigned i;
+ for(i = 0; i < DestTypes.size() && i < SrcTypes.size(); i ++) {
+ if (!CanPerformScalarCast(SrcTypes[i], DestTypes[i])) {
+ return false;
+ }
+ }
+ return true;
+ } else { // can't cast, Src is wrong size for Dest
+ return false;
+ }
+}
+
ExprResult SemaHLSL::ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg) {
assert(Param->hasAttr<HLSLParamModifierAttr>() &&
"We should not get here without a parameter modifier expression");
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index 7a900780384a91d..067ff064861ce75 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -522,6 +522,7 @@ void ExprEngine::VisitCast(const CastExpr *CastE, const Expr *Ex,
case CK_ToUnion:
case CK_MatrixCast:
case CK_VectorSplat:
+ case CK_HLSLAggregateCast:
case CK_HLSLVectorTruncation: {
QualType resultType = CastE->getType();
if (CastE->isGLValue())
>From 121f2a9ac38f8a8098db51f3fd3ccdc6e3fa6f7b Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Thu, 5 Dec 2024 17:41:51 +0000
Subject: [PATCH 02/18] fix broken test
---
clang/test/SemaHLSL/BuiltIns/vector-constructors-erros.hlsl | 2 --
1 file changed, 2 deletions(-)
diff --git a/clang/test/SemaHLSL/BuiltIns/vector-constructors-erros.hlsl b/clang/test/SemaHLSL/BuiltIns/vector-constructors-erros.hlsl
index 7f6bdc7e67836b7..b004acdc7c502ce 100644
--- a/clang/test/SemaHLSL/BuiltIns/vector-constructors-erros.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/vector-constructors-erros.hlsl
@@ -17,6 +17,4 @@ void entry() {
// These _should_ work in HLSL but aren't yet supported.
S s;
float2 GettingStrange = float2(s, s); // expected-error{{no viable conversion from 'S' to 'float'}} expected-error{{no viable conversion from 'S' to 'float'}}
- S2 s2;
- float2 EvenStranger = float2(s2); // expected-error{{cannot convert 'S2' to 'float2' (vector of 2 'float' values) without a conversion operator}}
}
>From 9cc06ce79bbae61309ff0ab060e570d129fb0be8 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Thu, 5 Dec 2024 17:44:38 +0000
Subject: [PATCH 03/18] make clang format happy
---
clang/lib/CodeGen/CGExpr.cpp | 36 +++++++-------
clang/lib/CodeGen/CGExprAgg.cpp | 48 +++++++++----------
clang/lib/CodeGen/CGExprScalar.cpp | 19 ++++----
clang/lib/CodeGen/CodeGenFunction.h | 11 +++--
clang/lib/Sema/SemaCast.cpp | 6 ++-
clang/lib/Sema/SemaHLSL.cpp | 7 +--
clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp | 2 +-
7 files changed, 66 insertions(+), 63 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index b7608b1226758d7..6b9c437ef7e2429 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6360,7 +6360,8 @@ LValue CodeGenFunction::EmitPseudoObjectLValue(const PseudoObjectExpr *E) {
return emitPseudoObjectExpr(*this, E, true, AggValueSlot::ignored()).LV;
}
-llvm::Value* CodeGenFunction::PerformLoad(std::pair<Address, llvm::Value *> &GEP) {
+llvm::Value *
+CodeGenFunction::PerformLoad(std::pair<Address, llvm::Value *> &GEP) {
Address GEPAddress = GEP.first;
llvm::Value *Idx = GEP.second;
llvm::Value *V = Builder.CreateLoad(GEPAddress, "load");
@@ -6370,8 +6371,9 @@ llvm::Value* CodeGenFunction::PerformLoad(std::pair<Address, llvm::Value *> &GEP
return V;
}
-llvm::Value* CodeGenFunction::PerformStore(std::pair<Address, llvm::Value *> &GEP,
- llvm::Value *Val) {
+llvm::Value *
+CodeGenFunction::PerformStore(std::pair<Address, llvm::Value *> &GEP,
+ llvm::Value *Val) {
Address GEPAddress = GEP.first;
llvm::Value *Idx = GEP.second;
if (Idx) {
@@ -6382,20 +6384,21 @@ llvm::Value* CodeGenFunction::PerformStore(std::pair<Address, llvm::Value *> &GE
}
}
-void CodeGenFunction::FlattenAccessAndType(Address Val, QualType SrcTy,
- SmallVector<llvm::Value *, 4> &IdxList,
- SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
- SmallVector<QualType> &FlatTypes) {
- llvm::IntegerType *IdxTy = llvm::IntegerType::get(getLLVMContext(),32);
+void CodeGenFunction::FlattenAccessAndType(
+ Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
+ SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
+ SmallVector<QualType> &FlatTypes) {
+ llvm::IntegerType *IdxTy = llvm::IntegerType::get(getLLVMContext(), 32);
if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(SrcTy)) {
uint64_t Size = CAT->getZExtSize();
- for(unsigned i = 0; i < Size; i ++) {
+ for (unsigned i = 0; i < Size; i++) {
// flatten each member of the array
// add index of this element to index list
llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
IdxList.push_back(Idx);
// recur on this object
- FlattenAccessAndType(Val, CAT->getElementType(), IdxList, GEPList, FlatTypes);
+ FlattenAccessAndType(Val, CAT->getElementType(), IdxList, GEPList,
+ FlatTypes);
// remove index of this element from index list
IdxList.pop_back();
}
@@ -6405,7 +6408,7 @@ void CodeGenFunction::FlattenAccessAndType(Address Val, QualType SrcTy,
// do I need to check if its a cxx record decl?
for (auto fieldIter = Record->field_begin(), fieldEnd = Record->field_end();
- fieldIter != fieldEnd; ++fieldIter) {
+ fieldIter != fieldEnd; ++fieldIter) {
// get the field number
unsigned FieldNum = RL.getLLVMFieldNo(*fieldIter);
// can we just do *fieldIter->getFieldIndex();
@@ -6414,16 +6417,16 @@ void CodeGenFunction::FlattenAccessAndType(Address Val, QualType SrcTy,
IdxList.push_back(Idx);
// recur on the field
FlattenAccessAndType(Val, fieldIter->getType(), IdxList, GEPList,
- FlatTypes);
+ FlatTypes);
// remove index of this element from index list
IdxList.pop_back();
}
} else if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
llvm::Type *VTy = ConvertTypeForMem(SrcTy);
CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
- Address GEP = Builder.CreateInBoundsGEP(Val, IdxList,
- VTy, Align, "vector.gep");
- for(unsigned i = 0; i < VT->getNumElements(); i ++) {
+ Address GEP =
+ Builder.CreateInBoundsGEP(Val, IdxList, VTy, Align, "vector.gep");
+ for (unsigned i = 0; i < VT->getNumElements(); i++) {
// add index to the list
llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
// create gep. no need to recur since its always a scalar
@@ -6435,8 +6438,7 @@ void CodeGenFunction::FlattenAccessAndType(Address Val, QualType SrcTy,
// create a gep
llvm::Type *Ty = ConvertTypeForMem(SrcTy);
CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
- Address GEP = Builder.CreateInBoundsGEP(Val, IdxList,
- Ty, Align, "gep");
+ Address GEP = Builder.CreateInBoundsGEP(Val, IdxList, Ty, Align, "gep");
GEPList.push_back({GEP, NULL});
FlatTypes.push_back(SrcTy);
}
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index bc8e1f0f9248efc..e3b47de958ce550 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -491,50 +491,45 @@ static bool isTrivialFiller(Expr *E) {
return false;
}
-
-
// emit a flat cast where the RHS is a scalar, including vector
static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
- QualType DestTy, llvm::Value *SrcVal,
- QualType SrcTy, SourceLocation Loc) {
+ QualType DestTy, llvm::Value *SrcVal,
+ QualType SrcTy, SourceLocation Loc) {
// Flatten our destination
SmallVector<QualType> DestTypes; // Flattened type
SmallVector<llvm::Value *, 4> IdxList;
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
- CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList,
- DestTypes);
+ CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
SrcTy = VT->getElementType();
assert(StoreGEPList.size() <= VT->getNumElements() &&
- "Cannot perform HLSL flat cast when vector source \
+ "Cannot perform HLSL flat cast when vector source \
object has less elements than flattened destination \
object.");
- for(unsigned i = 0; i < StoreGEPList.size(); i ++) {
- llvm::Value *Load = CGF.Builder.CreateExtractElement(SrcVal, i,
- "vec.load");
- llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTy,
- DestTypes[i],
- Loc);
- CGF.PerformStore(StoreGEPList[i], Cast);
- }
- return;
+ for (unsigned i = 0; i < StoreGEPList.size(); i++) {
+ llvm::Value *Load =
+ CGF.Builder.CreateExtractElement(SrcVal, i, "vec.load");
+ llvm::Value *Cast =
+ CGF.EmitScalarConversion(Load, SrcTy, DestTypes[i], Loc);
+ CGF.PerformStore(StoreGEPList[i], Cast);
+ }
+ return;
}
llvm_unreachable("HLSL Flat cast doesn't handle splatting.");
}
// emit a flat cast where the RHS is an aggregate
static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
- QualType DestTy, Address SrcVal,
- QualType SrcTy, SourceLocation Loc) {
+ QualType DestTy, Address SrcVal,
+ QualType SrcTy, SourceLocation Loc) {
// Flatten our destination
SmallVector<QualType> DestTypes; // Flattened type
SmallVector<llvm::Value *, 4> IdxList;
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
- CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList,
- DestTypes);
+ CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
// Flatten our src
SmallVector<QualType> SrcTypes; // Flattened type
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
@@ -543,14 +538,14 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
CGF.FlattenAccessAndType(SrcVal, SrcTy, IdxList, LoadGEPList, SrcTypes);
assert(StoreGEPList.size() <= LoadGEPList.size() &&
- "Cannot perform HLSL flat cast when flattened source object \
+ "Cannot perform HLSL flat cast when flattened source object \
has less elements than flattened destination object.");
// apply casts to what we load from LoadGEPList
// and store result in Dest
- for(unsigned i = 0; i < StoreGEPList.size(); i ++) {
+ for (unsigned i = 0; i < StoreGEPList.size(); i++) {
llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
- llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTypes[i],
- DestTypes[i], Loc);
+ llvm::Value *Cast =
+ CGF.EmitScalarConversion(Load, SrcTypes[i], DestTypes[i], Loc);
CGF.PerformStore(StoreGEPList[i], Cast);
}
}
@@ -967,11 +962,12 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
EmitHLSLScalarFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
} else { // RHS is an aggregate
assert(RV.isAggregate() &&
- "Can't perform HLSL Aggregate cast on a complex type.");
+ "Can't perform HLSL Aggregate cast on a complex type.");
Address SrcVal = RV.getAggregateAddress();
EmitHLSLAggregateFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
}
- break; }
+ break;
+ }
case CK_NoOp:
case CK_UserDefinedConversion:
case CK_ConstructorConversion:
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index d7bb702ec3ca20c..3809e3b1db3494d 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2264,8 +2264,8 @@ bool CodeGenFunction::ShouldNullCheckClassCastValue(const CastExpr *CE) {
// RHS is an aggregate type
static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
- QualType RHSTy, QualType LHSTy,
- SourceLocation Loc) {
+ QualType RHSTy, QualType LHSTy,
+ SourceLocation Loc) {
SmallVector<llvm::Value *, 4> IdxList;
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
SmallVector<QualType> SrcTypes; // Flattened type
@@ -2273,22 +2273,23 @@ static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
// LHS is either a vector or a builtin?
// if its a vector create a temp alloca to store into and return that
if (auto *VecTy = LHSTy->getAs<VectorType>()) {
- llvm::Value *V = CGF.Builder.CreateLoad(CGF.CreateIRTemp(LHSTy, "flatcast.tmp"));
+ llvm::Value *V =
+ CGF.Builder.CreateLoad(CGF.CreateIRTemp(LHSTy, "flatcast.tmp"));
// write to V.
- for(unsigned i = 0; i < VecTy->getNumElements(); i ++) {
+ for (unsigned i = 0; i < VecTy->getNumElements(); i++) {
llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
- llvm::Value *Cast = CGF.EmitScalarConversion(Load, SrcTypes[i],
- VecTy->getElementType(), Loc);
+ llvm::Value *Cast = CGF.EmitScalarConversion(
+ Load, SrcTypes[i], VecTy->getElementType(), Loc);
V = CGF.Builder.CreateInsertElement(V, Cast, i);
}
return V;
}
  // If it's a builtin, just do an extract element or load.
assert(LHSTy->isBuiltinType() &&
- "Destination type must be a vector or builtin type.");
+ "Destination type must be a vector or builtin type.");
// TODO add asserts about things being long enough
- return CGF.EmitScalarConversion(CGF.PerformLoad(LoadGEPList[0]),
- LHSTy, SrcTypes[0], Loc);
+ return CGF.EmitScalarConversion(CGF.PerformLoad(LoadGEPList[0]), LHSTy,
+ SrcTypes[0], Loc);
}
// VisitCastExpr - Emit code for an explicit or implicit cast. Implicit casts
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index b17ead377610e67..873dd781eb2e7d7 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4360,11 +4360,12 @@ class CodeGenFunction : public CodeGenTypeCache {
LValue EmitPseudoObjectLValue(const PseudoObjectExpr *e);
llvm::Value *PerformLoad(std::pair<Address, llvm::Value *> &GEP);
- llvm::Value *PerformStore(std::pair<Address, llvm::Value *> &GEP, llvm::Value *Val);
- void FlattenAccessAndType(Address Val, QualType SrcTy,
- SmallVector<llvm::Value *, 4> &IdxList,
- SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
- SmallVector<QualType> &FlatTypes);
+ llvm::Value *PerformStore(std::pair<Address, llvm::Value *> &GEP,
+ llvm::Value *Val);
+ void FlattenAccessAndType(
+ Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
+ SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
+ SmallVector<QualType> &FlatTypes);
llvm::Value *EmitIvarOffset(const ObjCInterfaceDecl *Interface,
const ObjCIvarDecl *Ivar);
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 955c44cf4a6a424..0bd7fc91aee18f9 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -23,9 +23,9 @@
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/Initialization.h"
+#include "clang/Sema/SemaHLSL.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/SemaRISCV.h"
-#include "clang/Sema/SemaHLSL.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include <set>
@@ -2780,7 +2780,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
if (Self.getLangOpts().HLSL &&
Self.HLSL().CanPerformAggregateCast(SrcExpr.get(), DestType)) {
if (SrcTy->isConstantArrayType())
- SrcExpr = Self.ImpCastExprToType(SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
+ SrcExpr = Self.ImpCastExprToType(
+ SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
+ CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
Kind = CK_HLSLAggregateCast;
return;
}
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 942c0a8fcaab09f..5c7af8056063adc 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2530,7 +2530,7 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
QualType SrcTy = Src->getType();
if (SrcTy->isScalarType()) // always a splat and this cast doesn't handle that
return false;
-
+
if ((DestTy->isScalarType() || DestTy->isVectorType()) &&
(SrcTy->isScalarType() || SrcTy->isVectorType()))
return false;
@@ -2540,11 +2540,12 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
llvm::SmallVector<QualType> SrcTypes;
BuildFlattenedTypeList(SrcTy, SrcTypes);
- // Usually the size of SrcTypes must be greater than or equal to the size of DestTypes.
+ // Usually the size of SrcTypes must be greater than or equal to the size of
+ // DestTypes.
if (SrcTypes.size() >= DestTypes.size()) {
unsigned i;
- for(i = 0; i < DestTypes.size() && i < SrcTypes.size(); i ++) {
+ for (i = 0; i < DestTypes.size() && i < SrcTypes.size(); i++) {
if (!CanPerformScalarCast(SrcTypes[i], DestTypes[i])) {
return false;
}
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index 067ff064861ce75..b105c196fc3bfb3 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -522,7 +522,7 @@ void ExprEngine::VisitCast(const CastExpr *CastE, const Expr *Ex,
case CK_ToUnion:
case CK_MatrixCast:
case CK_VectorSplat:
- case CK_HLSLAggregateCast:
+ case CK_HLSLAggregateCast:
case CK_HLSLVectorTruncation: {
QualType resultType = CastE->getType();
if (CastE->isGLValue())
>From e3e51b6761f2e9af61bfa6ae87860e05484e93c0 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Thu, 5 Dec 2024 17:46:16 +0000
Subject: [PATCH 04/18] CodeGen tests
---
.../BasicFeatures/ArrayFlatCast.hlsl | 128 ++++++++++++++++++
.../BasicFeatures/StructFlatCast.hlsl | 124 +++++++++++++++++
.../BasicFeatures/VectorFlatCast.hlsl | 81 +++++++++++
3 files changed, 333 insertions(+)
create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
new file mode 100644
index 000000000000000..23a71a2ecc6b965
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
@@ -0,0 +1,128 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
+
+// array truncation
+// CHECK-LABEL: define void {{.*}}call1
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [1 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+export void call1() {
+ int A[2] = {0,1};
+ int B[1] = {4};
+ B = (int[1])A;
+}
+
+// just a cast
+// CHECK-LABEL: define void {{.*}}call2
+// CHECK: [[A:%.*]] = alloca [1 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [1 x float], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [1 x i32], align 4
+// CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[A]], i8 0, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 4, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x float], ptr [[B]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [1 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
+// CHECK-NEXT: [[C:%.*]] = sitofp i32 [[L]] to float
+// CHECK-NEXT: store float [[C]], ptr [[G1]], align 4
+export void call2() {
+ int A[1] = {0};
+ float B[1] = {1.0};
+ B = (float[1])A;
+}
+
+// vector to array
+// CHECK-LABEL: define void {{.*}}call3
+// CHECK: [[A:%.*]] = alloca <1 x float>, align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [1 x i32], align 4
+// CHECK-NEXT: store <1 x float> splat (float 0x3FF3333340000000), ptr [[A]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
+// CHECK-NEXT: [[C:%.*]] = load <1 x float>, ptr [[A]], align 4
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0
+// CHECK-NEXT: [[V:%.*]] = extractelement <1 x float> [[C]], i64 0
+// CHECK-NEXT: [[C:%.*]] = fptosi float [[V]] to i32
+// CHECK-NEXT: store i32 [[C]], ptr [[G1]], align 4
+export void call3() {
+ float1 A = {1.2};
+ int B[1] = {1};
+ B = (int[1])A;
+}
+
+// flatten array of vector to array with cast
+// CHECK-LABEL: define void {{.*}}call5
+// CHECK: [[A:%.*]] = alloca [1 x <2 x float>], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [1 x <2 x float>], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[A]], ptr align 8 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Tmp]], ptr align 8 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 1
+// CHECK-NEXT: [[VG:%.*]] = getelementptr inbounds [1 x <2 x float>], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[VG]], align 8
+// CHECK-NEXT: [[VL:%.*]] = extractelement <2 x float> [[L]], i32 0
+// CHECK-NEXT: [[C:%.*]] = fptosi float [[VL]] to i32
+// CHECK-NEXT: store i32 [[C]], ptr [[G1]], align 4
+// CHECK-NEXT: [[L4:%.*]] = load <2 x float>, ptr [[VG]], align 8
+// CHECK-NEXT: [[VL5:%.*]] = extractelement <2 x float> [[L4]], i32 1
+// CHECK-NEXT: [[C6:%.*]] = fptosi float [[VL5]] to i32
+// CHECK-NEXT: store i32 [[C6]], ptr [[G2]], align 4
+export void call5() {
+ float2 A[1] = {{1.2,3.4}};
+ int B[2] = {1,2};
+ B = (int[2])A;
+}
+
+// flatten 2d array
+// CHECK-LABEL: define void {{.*}}call6
+// CHECK: [[A:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 1, i32 0
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
+// CHECK-NEXT: store i32 [[L4]], ptr [[G2]], align 4
+export void call6() {
+ int A[2][1] = {{1},{3}};
+ int B[2] = {1,2};
+ B = (int[2])A;
+}
+
+struct S {
+ int X;
+ float Y;
+};
+
+// flatten and truncate from a struct
+// CHECK-LABEL: define void {{.*}}call7
+// CHECK: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[A:%.*]] = alloca [1 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[A]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+export void call7() {
+ S s = {1, 2.9};
+ int A[1] = {1};
+ A = (int[1])s;
+}
+
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
new file mode 100644
index 000000000000000..c44e340109abb2c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+struct S {
+ int X;
+ float Y;
+};
+
+// struct from vector
+// CHECK-LABEL: define void {{.*}}call1
+// CHECK: [[A:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: store <2 x i32> <i32 1, i32 2>, ptr [[A]], align 8
+// CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[A]], align 8
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[VL:%.*]] = extractelement <2 x i32> [[L]], i64 0
+// CHECK-NEXT: store i32 [[VL]], ptr [[G1]], align 4
+// CHECK-NEXT: [[VL2:%.*]] = extractelement <2 x i32> [[L]], i64 1
+// CHECK-NEXT: [[C:%.*]] = sitofp i32 [[VL2]] to float
+// CHECK-NEXT: store float [[C]], ptr [[G2]], align 4
+export void call1() {
+ int2 A = {1,2};
+ S s = (S)A;
+}
+
+
+// struct from array
+// CHECK-LABEL: define void {{.*}}call2
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
+// CHECK-NEXT: [[C:%.*]] = sitofp i32 [[L4]] to float
+// CHECK-NEXT: store float [[C]], ptr [[G2]], align 4
+export void call2() {
+ int A[2] = {1,2};
+ S s = (S)A;
+}
+
+struct Q {
+ int Z;
+};
+
+struct R {
+ Q q;
+ float F;
+};
+
+// struct from nested struct?
+// CHECK-LABEL: define void {{.*}}call6
+// CHECK: [[r:%.*]] = alloca %struct.R, align 4
+// CHECK-NEXT: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.R, align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[r]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[r]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.R, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds %struct.R, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+// CHECK-NEXT: [[L4:%.*]] = load float, ptr [[G4]], align 4
+// CHECK-NEXT: store float [[L4]], ptr [[G2]], align 4
+export void call6() {
+ R r = {{1}, 2.0};
+ S s = (S)r;
+}
+
+// nested struct from array?
+// CHECK-LABEL: define void {{.*}}call7
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[r:%.*]] = alloca %struct.R, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.R, ptr [[r]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.R, ptr [[r]], i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
+// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
+// CHECK-NEXT: [[C:%.*]] = sitofp i32 [[L4]] to float
+// CHECK-NEXT: store float [[C]], ptr [[G2]], align 4
+export void call7() {
+ int A[2] = {1,2};
+ R r = (R)A;
+}
+
+struct T {
+ int A;
+ int B;
+ int C;
+};
+
+// struct truncation
+// CHECK-LABEL: define void {{.*}}call8
+// CHECK: [[t:%.*]] = alloca %struct.T, align 4
+// CHECK-NEXT: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.T, align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[t]], ptr align 4 {{.*}}, i32 12, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[t]], i32 12, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.T, ptr [[Tmp]], i32 0
+// CHECK-NEXT: %gep3 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 1
+// CHECK-NEXT: %gep4 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 2
+// CHECK-NEXT: %load = load i32, ptr %gep2, align 4
+// CHECK-NEXT: store i32 %load, ptr %gep, align 4
+// CHECK-NEXT: %load5 = load i32, ptr %gep3, align 4
+// CHECK-NEXT: %conv = sitofp i32 %load5 to float
+// CHECK-NEXT: store float %conv, ptr %gep1, align 4
+export void call8() {
+ T t = {1,2,3};
+ S s = (S)t;
+}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
new file mode 100644
index 000000000000000..9cd320ee9f62db3
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+// vector flat cast from array
+// CHECK-LABEL: define void {{.*}}call2
+// CHECK: [[A:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x [1 x i32]], align 4
+// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 1, i32 0
+// CHECK-NEXT: [[C:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 8
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: [[D:%.*]] = insertelement <2 x i32> [[C]], i32 [[L]], i64 0
+// CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[G2]], align 4
+// CHECK-NEXT: [[E:%.*]] = insertelement <2 x i32> [[D]], i32 [[L2]], i64 1
+// CHECK-NEXT: store <2 x i32> [[E]], ptr [[B]], align 8
+export void call2() {
+ int A[2][1] = {{1},{2}};
+ int2 B = (int2)A;
+}
+
+struct S {
+ int X;
+ float Y;
+};
+
+// vector flat cast from struct
+// CHECK-LABEL: define void {{.*}}call3
+// CHECK: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[A:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[B:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 8
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: [[C:%.*]] = insertelement <2 x i32> [[B]], i32 [[L]], i64 0
+// CHECK-NEXT: [[L2:%.*]] = load float, ptr [[G2]], align 4
+// CHECK-NEXT: [[D:%.*]] = fptosi float [[L2]] to i32
+// CHECK-NEXT: [[E:%.*]] = insertelement <2 x i32> [[C]], i32 [[D]], i64 1
+// CHECK-NEXT: store <2 x i32> [[E]], ptr [[A]], align 8
+export void call3() {
+ S s = {1, 2.0};
+ int2 A = (int2)s;
+}
+
+// truncate array to scalar
+// CHECK-LABEL: define void {{.*}}call4
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[B]], align 4
+export void call4() {
+ int A[2] = {1,2};
+ int B = (int)A;
+}
+
+// truncate struct to scalar
+// CHECK-LABEL: define void {{.*}}call5
+// CHECK: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[A]], align 4
+export void call5() {
+ S s = {1, 2.0};
+ int A = (int)s;
+}
>From 28858755ed754b2ba9affd92728505c5a5c39787 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Fri, 6 Dec 2024 19:12:34 +0000
Subject: [PATCH 05/18] address PR comments add some todos that need to be
addressed
---
clang/lib/AST/ExprConstant.cpp | 1 +
clang/lib/CodeGen/CGExprAgg.cpp | 1 +
clang/lib/Sema/SemaHLSL.cpp | 91 ++++++++-------------------------
3 files changed, 23 insertions(+), 70 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b548cef41b75257..e782e6227234afe 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14857,6 +14857,7 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_FixedPointCast:
case CK_IntegralToFixedPoint:
case CK_MatrixCast:
+ // TODO does CK_HLSLAggregateCast belong here?
llvm_unreachable("invalid cast kind for integral value");
case CK_BitCast:
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index e3b47de958ce550..b0d3a0fb5e5902c 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1538,6 +1538,7 @@ static bool castPreservesZero(const CastExpr *CE) {
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
case CK_HLSLVectorTruncation:
+ // TODO does CK_HLSLAggregateCast preserve zero?
return true;
case CK_BaseToDerivedMemberPointer:
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 5c7af8056063adc..8c32e77d4f799da 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2420,30 +2420,13 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
switch (Type::ScalarTypeKind SrcKind = SrcTy->getScalarTypeKind()) {
case Type::STK_MemberPointer:
- return false;
-
case Type::STK_CPointer:
case Type::STK_BlockPointer:
case Type::STK_ObjCObjectPointer:
- switch (DestTy->getScalarTypeKind()) {
- case Type::STK_CPointer:
- case Type::STK_BlockPointer:
- case Type::STK_ObjCObjectPointer:
- case Type::STK_Bool:
- case Type::STK_Integral:
- return true;
- case Type::STK_Floating:
- case Type::STK_FloatingComplex:
- case Type::STK_IntegralComplex:
- case Type::STK_MemberPointer:
- return false;
- case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
- }
- llvm_unreachable("Should have returned before this");
+ llvm_unreachable("HLSL doesn't support pointers.");
case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
+ llvm_unreachable("HLSL doesn't support fixed point types.");
case Type::STK_Bool: // casting from bool is like casting from an integer
case Type::STK_Integral:
@@ -2451,16 +2434,17 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
case Type::STK_CPointer:
case Type::STK_ObjCObjectPointer:
case Type::STK_BlockPointer:
+ case Type::STK_MemberPointer:
+ llvm_unreachable("HLSL doesn't support pointers.");
case Type::STK_Bool:
case Type::STK_Integral:
case Type::STK_Floating:
+ return true;
case Type::STK_IntegralComplex:
case Type::STK_FloatingComplex:
- return true;
+ llvm_unreachable("HLSL doesn't support complex types.");
case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
- case Type::STK_MemberPointer:
- return false;
+ llvm_unreachable("HLSL doesn't support fixed point types.");
}
llvm_unreachable("Should have returned before this");
@@ -2469,60 +2453,30 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
case Type::STK_Floating:
case Type::STK_Bool:
case Type::STK_Integral:
+ return true;
case Type::STK_FloatingComplex:
case Type::STK_IntegralComplex:
- return true;
+ llvm_unreachable("HLSL doesn't support complex types.");
case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
+ llvm_unreachable("HLSL doesn't support fixed point types.");
case Type::STK_CPointer:
case Type::STK_ObjCObjectPointer:
case Type::STK_BlockPointer:
case Type::STK_MemberPointer:
- return false;
+ llvm_unreachable("HLSL doesn't support pointers.");
}
llvm_unreachable("Should have returned before this");
case Type::STK_FloatingComplex:
- switch (DestTy->getScalarTypeKind()) {
- case Type::STK_FloatingComplex:
- case Type::STK_IntegralComplex:
- case Type::STK_Floating:
- case Type::STK_Bool:
- case Type::STK_Integral:
- return true;
- case Type::STK_CPointer:
- case Type::STK_ObjCObjectPointer:
- case Type::STK_BlockPointer:
- case Type::STK_MemberPointer:
- return false;
- case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
- }
- llvm_unreachable("Should have returned before this");
-
case Type::STK_IntegralComplex:
- switch (DestTy->getScalarTypeKind()) {
- case Type::STK_FloatingComplex:
- case Type::STK_IntegralComplex:
- case Type::STK_Integral:
- case Type::STK_Bool:
- case Type::STK_Floating:
- return true;
- case Type::STK_CPointer:
- case Type::STK_ObjCObjectPointer:
- case Type::STK_BlockPointer:
- case Type::STK_MemberPointer:
- return false;
- case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't have fixed point types.");
- }
- llvm_unreachable("Should have returned before this");
+ llvm_unreachable("HLSL doesn't support complex types.");
}
llvm_unreachable("Unhandled scalar cast");
}
// Can we perform an HLSL Flattened cast?
+// TODO: update this code when matrices are added
bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
// Don't handle casts where LHS and RHS are any combination of scalar/vector
@@ -2531,8 +2485,8 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
if (SrcTy->isScalarType()) // always a splat and this cast doesn't handle that
return false;
- if ((DestTy->isScalarType() || DestTy->isVectorType()) &&
- (SrcTy->isScalarType() || SrcTy->isVectorType()))
+ if (SrcTy->isVectorType() &&
+ (DestTy->isScalarType() || DestTy->isVectorType()))
return false;
llvm::SmallVector<QualType> DestTypes;
@@ -2542,18 +2496,15 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
// Usually the size of SrcTypes must be greater than or equal to the size of
// DestTypes.
- if (SrcTypes.size() >= DestTypes.size()) {
+ if (SrcTypes.size() < DestTypes.size())
+ return false;
- unsigned i;
- for (i = 0; i < DestTypes.size() && i < SrcTypes.size(); i++) {
- if (!CanPerformScalarCast(SrcTypes[i], DestTypes[i])) {
- return false;
- }
+ for (unsigned i = 0; i < DestTypes.size() && i < SrcTypes.size(); i++) {
+ if (!CanPerformScalarCast(SrcTypes[i], DestTypes[i])) {
+ return false;
}
- return true;
- } else { // can't cast, Src is wrong size for Dest
- return false;
}
+ return true;
}
ExprResult SemaHLSL::ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg) {
>From c5650eddc62863948b24646d98bd443c2f4e8edc Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Sat, 7 Dec 2024 17:16:39 +0000
Subject: [PATCH 06/18] new tests
---
.../SemaHLSL/Language/FlatCast-errors.hlsl | 8 +++++++
clang/test/SemaHLSL/Language/FlatCasts.hlsl | 23 +++++++++++++++++++
2 files changed, 31 insertions(+)
create mode 100644 clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
create mode 100644 clang/test/SemaHLSL/Language/FlatCasts.hlsl
diff --git a/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl b/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
new file mode 100644
index 000000000000000..0197e8b1e36760a
--- /dev/null
+++ b/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -verify
+
+export void cantCast() {
+ int A[3] = {1,2,3};
+ int B[4] = {1,2,3,4};
+ B = (int[4])A;
+ // expected-error at -1 {{C-style cast from 'int *' to 'int[4]' is not allowed}}
+}
\ No newline at end of file
diff --git a/clang/test/SemaHLSL/Language/FlatCasts.hlsl b/clang/test/SemaHLSL/Language/FlatCasts.hlsl
new file mode 100644
index 000000000000000..c869b32f0276c8b
--- /dev/null
+++ b/clang/test/SemaHLSL/Language/FlatCasts.hlsl
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -fnative-half-type %s -ast-dump | FileCheck %s
+
+// truncation
+// CHECK-LABEL: call1
+// CHECK: CStyleCastExpr {{.*}} 'int[1]' <HLSLAggregateCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[2]' <HLSLArrayRValue> part_of_explicit_cast
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int[2]' lvalue Var {{.*}} 'A' 'int[2]'
+export void call1() {
+ int A[2] = {0,1};
+ int B[1] = {4};
+ B = (int[1])A;
+}
+
+// flat cast of equal size
+// CHECK-LABEL: call2
+// CHECK: CStyleCastExpr {{.*}} 'float[1]' <HLSLAggregateCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[1]' <HLSLArrayRValue> part_of_explicit_cast
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int[1]' lvalue Var {{.*}} 'A' 'int[1]'
+export void call2() {
+ int A[1] = {0};
+ float B[1] = {1.0};
+ B = (float[1])A;
+}
>From f4819b80212065b611c6b1efec02cf4db15f664a Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Mon, 9 Dec 2024 20:31:57 +0000
Subject: [PATCH 07/18] Self Review.
---
clang/lib/CodeGen/CGExpr.cpp | 24 -------------
clang/lib/CodeGen/CGExprAgg.cpp | 34 +++++++++++++++++--
clang/lib/CodeGen/CGExprScalar.cpp | 15 ++++++--
clang/lib/CodeGen/CodeGenFunction.h | 3 --
clang/lib/Sema/SemaCast.cpp | 4 +--
clang/lib/Sema/SemaHLSL.cpp | 29 ++++++++--------
.../BasicFeatures/ArrayFlatCast.hlsl | 32 ++++++++---------
.../BasicFeatures/StructFlatCast.hlsl | 34 +++++++++----------
.../BasicFeatures/VectorFlatCast.hlsl | 16 ++++-----
9 files changed, 100 insertions(+), 91 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6b9c437ef7e2429..f85f10eeb422d29 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6360,30 +6360,6 @@ LValue CodeGenFunction::EmitPseudoObjectLValue(const PseudoObjectExpr *E) {
return emitPseudoObjectExpr(*this, E, true, AggValueSlot::ignored()).LV;
}
-llvm::Value *
-CodeGenFunction::PerformLoad(std::pair<Address, llvm::Value *> &GEP) {
- Address GEPAddress = GEP.first;
- llvm::Value *Idx = GEP.second;
- llvm::Value *V = Builder.CreateLoad(GEPAddress, "load");
- if (Idx) { // loading from a vector so perform an extract as well
- return Builder.CreateExtractElement(V, Idx, "vec.load");
- }
- return V;
-}
-
-llvm::Value *
-CodeGenFunction::PerformStore(std::pair<Address, llvm::Value *> &GEP,
- llvm::Value *Val) {
- Address GEPAddress = GEP.first;
- llvm::Value *Idx = GEP.second;
- if (Idx) {
- llvm::Value *V = Builder.CreateLoad(GEPAddress, "load.for.insert");
- return Builder.CreateInsertElement(V, Val, Idx);
- } else {
- return Builder.CreateStore(Val, GEPAddress);
- }
-}
-
void CodeGenFunction::FlattenAccessAndType(
Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index b0d3a0fb5e5902c..cb4e24062aad212 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -498,6 +498,9 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
// Flatten our destination
SmallVector<QualType> DestTypes; // Flattened type
SmallVector<llvm::Value *, 4> IdxList;
+ IdxList.push_back(
+ llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
+ 0)); // because an Address is a pointer
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
@@ -513,7 +516,15 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
CGF.Builder.CreateExtractElement(SrcVal, i, "vec.load");
llvm::Value *Cast =
CGF.EmitScalarConversion(Load, SrcTy, DestTypes[i], Loc);
- CGF.PerformStore(StoreGEPList[i], Cast);
+
+ // store back
+ llvm::Value *Idx = StoreGEPList[i].second;
+ if (Idx) {
+ llvm::Value *V =
+ CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
+ Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
+ }
+ CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
}
return;
}
@@ -527,6 +538,9 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
// Flatten our destination
SmallVector<QualType> DestTypes; // Flattened type
SmallVector<llvm::Value *, 4> IdxList;
+ IdxList.push_back(
+ llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
+ 0)); // Because an Address is a pointer
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
@@ -535,6 +549,9 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
// ^^ Flattened accesses to SrcVal we want to load from
IdxList.clear();
+ IdxList.push_back(
+ llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
+ 0)); // Because an Address is a pointer
CGF.FlattenAccessAndType(SrcVal, SrcTy, IdxList, LoadGEPList, SrcTypes);
assert(StoreGEPList.size() <= LoadGEPList.size() &&
@@ -543,10 +560,21 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
// apply casts to what we load from LoadGEPList
// and store result in Dest
for (unsigned i = 0; i < StoreGEPList.size(); i++) {
- llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
+ llvm::Value *Idx = LoadGEPList[i].second;
+ llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[i].first, "load");
+ Load =
+ Idx ? CGF.Builder.CreateExtractElement(Load, Idx, "vec.extract") : Load;
llvm::Value *Cast =
CGF.EmitScalarConversion(Load, SrcTypes[i], DestTypes[i], Loc);
- CGF.PerformStore(StoreGEPList[i], Cast);
+
+ // store back
+ Idx = StoreGEPList[i].second;
+ if (Idx) {
+ llvm::Value *V =
+ CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
+ Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
+ }
+ CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
}
}
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 3809e3b1db3494d..b906f89fb620184 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2267,6 +2267,9 @@ static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
QualType RHSTy, QualType LHSTy,
SourceLocation Loc) {
SmallVector<llvm::Value *, 4> IdxList;
+ IdxList.push_back(
+ llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
+ 0)); // because an Address is a pointer
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
SmallVector<QualType> SrcTypes; // Flattened type
CGF.FlattenAccessAndType(RHSVal, RHSTy, IdxList, LoadGEPList, SrcTypes);
@@ -2277,7 +2280,10 @@ static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
CGF.Builder.CreateLoad(CGF.CreateIRTemp(LHSTy, "flatcast.tmp"));
// write to V.
for (unsigned i = 0; i < VecTy->getNumElements(); i++) {
- llvm::Value *Load = CGF.PerformLoad(LoadGEPList[i]);
+ llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[i].first, "load");
+ llvm::Value *Idx = LoadGEPList[i].second;
+ Load = Idx ? CGF.Builder.CreateExtractElement(Load, Idx, "vec.extract")
+ : Load;
llvm::Value *Cast = CGF.EmitScalarConversion(
Load, SrcTypes[i], VecTy->getElementType(), Loc);
V = CGF.Builder.CreateInsertElement(V, Cast, i);
@@ -2288,8 +2294,11 @@ static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
assert(LHSTy->isBuiltinType() &&
"Destination type must be a vector or builtin type.");
// TODO add asserts about things being long enough
- return CGF.EmitScalarConversion(CGF.PerformLoad(LoadGEPList[0]), LHSTy,
- SrcTypes[0], Loc);
+ llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[0].first, "load");
+ llvm::Value *Idx = LoadGEPList[0].second;
+ Load =
+ Idx ? CGF.Builder.CreateExtractElement(Load, Idx, "vec.extract") : Load;
+ return CGF.EmitScalarConversion(Load, LHSTy, SrcTypes[0], Loc);
}
// VisitCastExpr - Emit code for an explicit or implicit cast. Implicit casts
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 873dd781eb2e7d7..9a08f1fcae27f04 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4359,9 +4359,6 @@ class CodeGenFunction : public CodeGenTypeCache {
AggValueSlot slot = AggValueSlot::ignored());
LValue EmitPseudoObjectLValue(const PseudoObjectExpr *e);
- llvm::Value *PerformLoad(std::pair<Address, llvm::Value *> &GEP);
- llvm::Value *PerformStore(std::pair<Address, llvm::Value *> &GEP,
- llvm::Value *Val);
void FlattenAccessAndType(
Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 0bd7fc91aee18f9..35a9afc0d160723 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -2772,10 +2772,8 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
CheckedConversionKind CCK = FunctionalStyle
? CheckedConversionKind::FunctionalCast
: CheckedConversionKind::CStyleCast;
- // todo what else should i be doing lvalue to rvalue cast for?
- // why dont they do it for records below?
// This case should not trigger on regular vector splat
- // Or vector cast or vector truncation.
+ // vector cast, vector truncation, or special hlsl splat cases
QualType SrcTy = SrcExpr.get()->getType();
if (Self.getLangOpts().HLSL &&
Self.HLSL().CanPerformAggregateCast(SrcExpr.get(), DestType)) {
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 8c32e77d4f799da..3ba5b1f9a95bf29 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2412,34 +2412,26 @@ bool SemaHLSL::CheckCompatibleParameterABI(FunctionDecl *New,
return HadError;
}
-// Follows PerformScalarCast
+// Generally follows PerformScalarCast, with cases reordered for
+// clarity of what types are supported
bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
if (SemaRef.getASTContext().hasSameUnqualifiedType(SrcTy, DestTy))
return true;
switch (Type::ScalarTypeKind SrcKind = SrcTy->getScalarTypeKind()) {
- case Type::STK_MemberPointer:
- case Type::STK_CPointer:
- case Type::STK_BlockPointer:
- case Type::STK_ObjCObjectPointer:
- llvm_unreachable("HLSL doesn't support pointers.");
-
- case Type::STK_FixedPoint:
- llvm_unreachable("HLSL doesn't support fixed point types.");
-
case Type::STK_Bool: // casting from bool is like casting from an integer
case Type::STK_Integral:
switch (DestTy->getScalarTypeKind()) {
+ case Type::STK_Bool:
+ case Type::STK_Integral:
+ case Type::STK_Floating:
+ return true;
case Type::STK_CPointer:
case Type::STK_ObjCObjectPointer:
case Type::STK_BlockPointer:
case Type::STK_MemberPointer:
llvm_unreachable("HLSL doesn't support pointers.");
- case Type::STK_Bool:
- case Type::STK_Integral:
- case Type::STK_Floating:
- return true;
case Type::STK_IntegralComplex:
case Type::STK_FloatingComplex:
llvm_unreachable("HLSL doesn't support complex types.");
@@ -2467,6 +2459,15 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
}
llvm_unreachable("Should have returned before this");
+ case Type::STK_MemberPointer:
+ case Type::STK_CPointer:
+ case Type::STK_BlockPointer:
+ case Type::STK_ObjCObjectPointer:
+ llvm_unreachable("HLSL doesn't support pointers.");
+
+ case Type::STK_FixedPoint:
+ llvm_unreachable("HLSL doesn't support fixed point types.");
+
case Type::STK_FloatingComplex:
case Type::STK_IntegralComplex:
llvm_unreachable("HLSL doesn't support complex types.");
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
index 23a71a2ecc6b965..afe301143ebc69b 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
@@ -8,9 +8,9 @@
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
export void call1() {
@@ -27,8 +27,8 @@ export void call1() {
// CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[A]], i8 0, i32 4, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 4, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x float], ptr [[B]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [1 x i32], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x float], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [1 x i32], ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
// CHECK-NEXT: [[C:%.*]] = sitofp i32 [[L]] to float
// CHECK-NEXT: store float [[C]], ptr [[G1]], align 4
@@ -45,7 +45,7 @@ export void call2() {
// CHECK-NEXT: store <1 x float> splat (float 0x3FF3333340000000), ptr [[A]], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 4, i1 false)
// CHECK-NEXT: [[C:%.*]] = load <1 x float>, ptr [[A]], align 4
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[B]], i32 0, i32 0
// CHECK-NEXT: [[V:%.*]] = extractelement <1 x float> [[C]], i64 0
// CHECK-NEXT: [[C:%.*]] = fptosi float [[V]] to i32
// CHECK-NEXT: store i32 [[C]], ptr [[G1]], align 4
@@ -63,9 +63,9 @@ export void call3() {
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[A]], ptr align 8 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Tmp]], ptr align 8 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 1
-// CHECK-NEXT: [[VG:%.*]] = getelementptr inbounds [1 x <2 x float>], ptr [[Tmp]], i32 0
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0, i32 1
+// CHECK-NEXT: [[VG:%.*]] = getelementptr inbounds [1 x <2 x float>], ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[VG]], align 8
// CHECK-NEXT: [[VL:%.*]] = extractelement <2 x float> [[L]], i32 0
// CHECK-NEXT: [[C:%.*]] = fptosi float [[VL]] to i32
@@ -88,10 +88,10 @@ export void call5() {
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[B]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 1
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0
-// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 1, i32 0
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i32 0, i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 1, i32 0
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
@@ -115,9 +115,9 @@ struct S {
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 4, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[A]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [1 x i32], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G2]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
export void call7() {
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
index c44e340109abb2c..3ebdb0c32981707 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
@@ -11,8 +11,8 @@ struct S {
// CHECK-NEXT: [[s:%.*]] = alloca %struct.S, align 4
// CHECK-NEXT: store <2 x i32> <i32 1, i32 2>, ptr [[A]], align 8
// CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[A]], align 8
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 1
// CHECK-NEXT: [[VL:%.*]] = extractelement <2 x i32> [[L]], i64 0
// CHECK-NEXT: store i32 [[VL]], ptr [[G1]], align 4
// CHECK-NEXT: [[VL2:%.*]] = extractelement <2 x i32> [[L]], i64 1
@@ -31,10 +31,10 @@ export void call1() {
// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
@@ -61,10 +61,10 @@ struct R {
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.R, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[r]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[r]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 1
// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.R, ptr [[Tmp]], i32 0, i32 0
-// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds %struct.R, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds %struct.R, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
// CHECK-NEXT: [[L4:%.*]] = load float, ptr [[G4]], align 4
@@ -82,9 +82,9 @@ export void call6() {
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.R, ptr [[r]], i32 0, i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.R, ptr [[r]], i32 1
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.R, ptr [[r]], i32 0, i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G3]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[G1]], align 4
// CHECK-NEXT: [[L4:%.*]] = load i32, ptr [[G4]], align 4
@@ -108,11 +108,11 @@ struct T {
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.T, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[t]], ptr align 4 {{.*}}, i32 12, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[t]], i32 12, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 1
-// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.T, ptr [[Tmp]], i32 0
-// CHECK-NEXT: %gep3 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 1
-// CHECK-NEXT: %gep4 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 2
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[s]], i32 0, i32 1
+// CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds %struct.T, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: %gep3 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 0, i32 1
+// CHECK-NEXT: %gep4 = getelementptr inbounds %struct.T, ptr %agg-temp, i32 0, i32 2
// CHECK-NEXT: %load = load i32, ptr %gep2, align 4
// CHECK-NEXT: store i32 %load, ptr %gep, align 4
// CHECK-NEXT: %load5 = load i32, ptr %gep3, align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
index 9cd320ee9f62db3..f579dfb377de59d 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
@@ -8,8 +8,8 @@
// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 8
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 1, i32 0
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 1, i32 0
// CHECK-NEXT: [[C:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 8
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: [[D:%.*]] = insertelement <2 x i32> [[C]], i32 [[L]], i64 0
@@ -34,8 +34,8 @@ struct S {
// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 8
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[B:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 8
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: [[C:%.*]] = insertelement <2 x i32> [[B]], i32 [[L]], i64 0
@@ -55,8 +55,8 @@ export void call3() {
// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[B]], align 4
export void call4() {
@@ -71,8 +71,8 @@ export void call4() {
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
-// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0
-// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 1
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[A]], align 4
export void call5() {
>From 89709adc295d94c3befd10ab0b1408ed5045c8bb Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Wed, 11 Dec 2024 19:14:51 +0000
Subject: [PATCH 08/18] Self review continued. Make FlattenAccessAndType not
recursive and handle records correctly.
---
clang/lib/CodeGen/CGExpr.cpp | 137 ++++++++++++++++------------
clang/lib/CodeGen/CGExprAgg.cpp | 24 ++---
clang/lib/CodeGen/CGExprScalar.cpp | 8 +-
clang/lib/CodeGen/CodeGenFunction.h | 6 +-
4 files changed, 92 insertions(+), 83 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index f85f10eeb422d29..62980f5077e95d6 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6361,62 +6361,87 @@ LValue CodeGenFunction::EmitPseudoObjectLValue(const PseudoObjectExpr *E) {
}
void CodeGenFunction::FlattenAccessAndType(
- Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
- SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
- SmallVector<QualType> &FlatTypes) {
+ Address Addr, QualType AddrType,
+ SmallVectorImpl<std::pair<Address, llvm::Value *>> &AccessList,
+ SmallVectorImpl<QualType> &FlatTypes) {
+ // WorkList is list of type we are processing + the Index List to access
+ // the field of that type in Addr for use in a GEP
+ llvm::SmallVector<std::pair<QualType, llvm::SmallVector<llvm::Value *, 4>>,
+ 16>
+ WorkList;
llvm::IntegerType *IdxTy = llvm::IntegerType::get(getLLVMContext(), 32);
- if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(SrcTy)) {
- uint64_t Size = CAT->getZExtSize();
- for (unsigned i = 0; i < Size; i++) {
- // flatten each member of the array
- // add index of this element to index list
- llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
- IdxList.push_back(Idx);
- // recur on this object
- FlattenAccessAndType(Val, CAT->getElementType(), IdxList, GEPList,
- FlatTypes);
- // remove index of this element from index list
- IdxList.pop_back();
- }
- } else if (const RecordType *RT = SrcTy->getAs<RecordType>()) {
- RecordDecl *Record = RT->getDecl();
- const CGRecordLayout &RL = getTypes().getCGRecordLayout(Record);
- // do I need to check if its a cxx record decl?
-
- for (auto fieldIter = Record->field_begin(), fieldEnd = Record->field_end();
- fieldIter != fieldEnd; ++fieldIter) {
- // get the field number
- unsigned FieldNum = RL.getLLVMFieldNo(*fieldIter);
- // can we just do *fieldIter->getFieldIndex();
- // add that index to the index list
- llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, FieldNum);
- IdxList.push_back(Idx);
- // recur on the field
- FlattenAccessAndType(Val, fieldIter->getType(), IdxList, GEPList,
- FlatTypes);
- // remove index of this element from index list
- IdxList.pop_back();
- }
- } else if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
- llvm::Type *VTy = ConvertTypeForMem(SrcTy);
- CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
- Address GEP =
- Builder.CreateInBoundsGEP(Val, IdxList, VTy, Align, "vector.gep");
- for (unsigned i = 0; i < VT->getNumElements(); i++) {
- // add index to the list
- llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
- // create gep. no need to recur since its always a scalar
- // gep on vector is not recommended so combine gep with extract/insert
- GEPList.push_back({GEP, Idx});
- FlatTypes.push_back(VT->getElementType());
+ WorkList.push_back(
+ {AddrType,
+ {llvm::ConstantInt::get(
+ IdxTy,
+ 0)}}); // Addr should be a pointer so we need to 'dereference' it
+
+ while (!WorkList.empty()) {
+ std::pair<QualType, llvm::SmallVector<llvm::Value *, 4>> P =
+ WorkList.pop_back_val();
+ QualType T = P.first;
+ llvm::SmallVector<llvm::Value *, 4> IdxList = P.second;
+ T = T.getCanonicalType().getUnqualifiedType();
+ assert(!isa<MatrixType>(T) && "Matrix types not yet supported in HLSL");
+ if (const auto *CAT = dyn_cast<ConstantArrayType>(T)) {
+ uint64_t Size = CAT->getZExtSize();
+ for (int64_t i = Size - 1; i > -1; i--) {
+ llvm::SmallVector<llvm::Value *, 4> IdxListCopy = IdxList;
+ IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, i));
+ WorkList.insert(WorkList.end(), {CAT->getElementType(), IdxListCopy});
+ }
+ } else if (const auto *RT = dyn_cast<RecordType>(T)) {
+ const RecordDecl *Record = RT->getDecl();
+ if (Record->isUnion()) {
+ IdxList.push_back(llvm::ConstantInt::get(IdxTy, 0));
+ llvm::Type *LLVMT = ConvertTypeForMem(T);
+ CharUnits Align = getContext().getTypeAlignInChars(T);
+ Address GEP =
+ Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "union.gep");
+ AccessList.push_back({GEP, NULL});
+ FlatTypes.push_back(T);
+ continue;
+ }
+ const CXXRecordDecl *CXXD = dyn_cast<CXXRecordDecl>(Record);
+
+ llvm::SmallVector<QualType, 16> FieldTypes;
+ if (CXXD && CXXD->isStandardLayout())
+ Record = CXXD->getStandardLayoutBaseWithFields();
+
+ // deal with potential base classes
+ if (CXXD && !CXXD->isStandardLayout()) {
+ for (auto &Base : CXXD->bases())
+ FieldTypes.push_back(Base.getType());
+ }
+
+ for (auto *FD : Record->fields())
+ FieldTypes.push_back(FD->getType());
+
+ for (int64_t i = FieldTypes.size() - 1; i > -1; i--) {
+ llvm::SmallVector<llvm::Value *, 4> IdxListCopy = IdxList;
+ IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, i));
+ WorkList.insert(WorkList.end(), {FieldTypes[i], IdxListCopy});
+ }
+ } else if (const auto *VT = dyn_cast<VectorType>(T)) {
+ llvm::Type *LLVMT = ConvertTypeForMem(T);
+ CharUnits Align = getContext().getTypeAlignInChars(T);
+ Address GEP =
+ Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "vector.gep");
+ for (unsigned i = 0; i < VT->getNumElements(); i++) {
+ llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
+ // gep on vector fields is not recommended so combine gep with
+ // extract/insert
+ AccessList.push_back({GEP, Idx});
+ FlatTypes.push_back(VT->getElementType());
+ }
+ } else {
+ // a scalar/builtin type
+ llvm::Type *LLVMT = ConvertTypeForMem(T);
+ CharUnits Align = getContext().getTypeAlignInChars(T);
+ Address GEP =
+ Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "gep");
+ AccessList.push_back({GEP, NULL});
+ FlatTypes.push_back(T);
}
- } else { // should be a scalar should we assert or check?
- // create a gep
- llvm::Type *Ty = ConvertTypeForMem(SrcTy);
- CharUnits Align = getContext().getTypeAlignInChars(SrcTy);
- Address GEP = Builder.CreateInBoundsGEP(Val, IdxList, Ty, Align, "gep");
- GEPList.push_back({GEP, NULL});
- FlatTypes.push_back(SrcTy);
- }
- // target extension types?
+ }
}
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index cb4e24062aad212..62a85e983d1c973 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -496,14 +496,10 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
QualType DestTy, llvm::Value *SrcVal,
QualType SrcTy, SourceLocation Loc) {
// Flatten our destination
- SmallVector<QualType> DestTypes; // Flattened type
- SmallVector<llvm::Value *, 4> IdxList;
- IdxList.push_back(
- llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
- 0)); // because an Address is a pointer
+ SmallVector<QualType, 16> DestTypes; // Flattened type
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
- CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
+ CGF.FlattenAccessAndType(DestVal, DestTy, StoreGEPList, DestTypes);
if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
SrcTy = VT->getElementType();
@@ -536,23 +532,15 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
QualType DestTy, Address SrcVal,
QualType SrcTy, SourceLocation Loc) {
// Flatten our destination
- SmallVector<QualType> DestTypes; // Flattened type
- SmallVector<llvm::Value *, 4> IdxList;
- IdxList.push_back(
- llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
- 0)); // Because an Address is a pointer
+ SmallVector<QualType, 16> DestTypes; // Flattened type
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
// ^^ Flattened accesses to DestVal we want to store into
- CGF.FlattenAccessAndType(DestVal, DestTy, IdxList, StoreGEPList, DestTypes);
+ CGF.FlattenAccessAndType(DestVal, DestTy, StoreGEPList, DestTypes);
// Flatten our src
- SmallVector<QualType> SrcTypes; // Flattened type
+ SmallVector<QualType, 16> SrcTypes; // Flattened type
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
// ^^ Flattened accesses to SrcVal we want to load from
- IdxList.clear();
- IdxList.push_back(
- llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
- 0)); // Because an Address is a pointer
- CGF.FlattenAccessAndType(SrcVal, SrcTy, IdxList, LoadGEPList, SrcTypes);
+ CGF.FlattenAccessAndType(SrcVal, SrcTy, LoadGEPList, SrcTypes);
assert(StoreGEPList.size() <= LoadGEPList.size() &&
"Cannot perform HLSL flat cast when flattened source object \
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index b906f89fb620184..e0f9be5642cd778 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2266,13 +2266,9 @@ bool CodeGenFunction::ShouldNullCheckClassCastValue(const CastExpr *CE) {
static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
QualType RHSTy, QualType LHSTy,
SourceLocation Loc) {
- SmallVector<llvm::Value *, 4> IdxList;
- IdxList.push_back(
- llvm::ConstantInt::get(llvm::IntegerType::get(CGF.getLLVMContext(), 32),
- 0)); // because an Address is a pointer
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
- SmallVector<QualType> SrcTypes; // Flattened type
- CGF.FlattenAccessAndType(RHSVal, RHSTy, IdxList, LoadGEPList, SrcTypes);
+ SmallVector<QualType, 16> SrcTypes; // Flattened type
+ CGF.FlattenAccessAndType(RHSVal, RHSTy, LoadGEPList, SrcTypes);
// LHS is either a vector or a builtin type.
// If it's a vector, create a temp alloca to store into and return that.
if (auto *VecTy = LHSTy->getAs<VectorType>()) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 9a08f1fcae27f04..1d513d20e81b785 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4360,9 +4360,9 @@ class CodeGenFunction : public CodeGenTypeCache {
LValue EmitPseudoObjectLValue(const PseudoObjectExpr *e);
void FlattenAccessAndType(
- Address Val, QualType SrcTy, SmallVector<llvm::Value *, 4> &IdxList,
- SmallVector<std::pair<Address, llvm::Value *>, 16> &GEPList,
- SmallVector<QualType> &FlatTypes);
+ Address Addr, QualType AddrTy,
+ SmallVectorImpl<std::pair<Address, llvm::Value *>> &AccessList,
+ SmallVectorImpl<QualType> &FlatTypes);
llvm::Value *EmitIvarOffset(const ObjCInterfaceDecl *Interface,
const ObjCIvarDecl *Ivar);
>From 162c2b5078e2741ee70d4657e40e167d5b876740 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Fri, 20 Dec 2024 17:48:49 +0000
Subject: [PATCH 09/18] two tests showing truncation to a scalar
---
.../CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl | 16 ++++++++++++++++
.../BasicFeatures/StructFlatCast.hlsl | 16 ++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
index afe301143ebc69b..18f82bff3b3086e 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
@@ -1,5 +1,21 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
+// array truncation to a scalar
+// CHECK-LABEL: define void {{.*}}call0
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[B]], align 4
+export void call0() {
+ int A[2] = {0,1};
+ float B = (float)A;
+}
+
// array truncation
// CHECK-LABEL: define void {{.*}}call1
// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
index 3ebdb0c32981707..26fde37c901dd0f 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
@@ -5,6 +5,22 @@ struct S {
float Y;
};
+// struct truncation to a scalar
+// CHECK-LABEL: define void {{.*}}call0
+// CHECK: [[s:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[s]], ptr align 4 {{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[s]], i32 8, i1 false)
+// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
+// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
+// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
+// CHECK-NEXT: store i32 [[L]], ptr [[A]], align 4
+export void call0() {
+ S s = {1,2};
+ int A = (int)s;
+}
+
// struct from vector
// CHECK-LABEL: define void {{.*}}call1
// CHECK: [[A:%.*]] = alloca <2 x i32>, align 8
>From fa0e9c90a2b2b19ba466ac26a162f9d50eb583a3 Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 22 Jan 2025 12:01:36 -0800
Subject: [PATCH 10/18] respond to pr comments
---
clang/lib/CodeGen/CGExpr.cpp | 16 +++---------
clang/lib/CodeGen/CGExprAgg.cpp | 42 ++++++++++++++----------------
clang/lib/CodeGen/CGExprScalar.cpp | 9 +++----
clang/lib/Sema/SemaHLSL.cpp | 15 ++++++++---
4 files changed, 40 insertions(+), 42 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 62980f5077e95d6..44f37c5bede06c1 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6385,23 +6385,15 @@ void CodeGenFunction::FlattenAccessAndType(
assert(!isa<MatrixType>(T) && "Matrix types not yet supported in HLSL");
if (const auto *CAT = dyn_cast<ConstantArrayType>(T)) {
uint64_t Size = CAT->getZExtSize();
- for (int64_t i = Size - 1; i > -1; i--) {
+ for (int64_t I = Size - 1; I > -1; I--) {
llvm::SmallVector<llvm::Value *, 4> IdxListCopy = IdxList;
- IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, i));
+ IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, I));
WorkList.insert(WorkList.end(), {CAT->getElementType(), IdxListCopy});
}
} else if (const auto *RT = dyn_cast<RecordType>(T)) {
const RecordDecl *Record = RT->getDecl();
- if (Record->isUnion()) {
- IdxList.push_back(llvm::ConstantInt::get(IdxTy, 0));
- llvm::Type *LLVMT = ConvertTypeForMem(T);
- CharUnits Align = getContext().getTypeAlignInChars(T);
- Address GEP =
- Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "union.gep");
- AccessList.push_back({GEP, NULL});
- FlatTypes.push_back(T);
- continue;
- }
+ assert(!Record->isUnion() && "Union types not supported in flat cast.");
+
const CXXRecordDecl *CXXD = dyn_cast<CXXRecordDecl>(Record);
llvm::SmallVector<QualType, 16> FieldTypes;
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 62a85e983d1c973..7ac9188ed077d63 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -501,30 +501,28 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
// ^^ Flattened accesses to DestVal we want to store into
CGF.FlattenAccessAndType(DestVal, DestTy, StoreGEPList, DestTypes);
- if (const VectorType *VT = SrcTy->getAs<VectorType>()) {
- SrcTy = VT->getElementType();
- assert(StoreGEPList.size() <= VT->getNumElements() &&
- "Cannot perform HLSL flat cast when vector source \
- object has less elements than flattened destination \
- object.");
- for (unsigned i = 0; i < StoreGEPList.size(); i++) {
- llvm::Value *Load =
- CGF.Builder.CreateExtractElement(SrcVal, i, "vec.load");
- llvm::Value *Cast =
- CGF.EmitScalarConversion(Load, SrcTy, DestTypes[i], Loc);
-
- // store back
- llvm::Value *Idx = StoreGEPList[i].second;
- if (Idx) {
- llvm::Value *V =
- CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
- Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
- }
- CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
+ assert(SrcTy->isVectorType() && "HLSL Flat cast doesn't handle splatting.");
+ const VectorType *VT = SrcTy->getAs<VectorType>();
+ SrcTy = VT->getElementType();
+ assert(StoreGEPList.size() <= VT->getNumElements() &&
+ "Cannot perform HLSL flat cast when vector source \
+ object has less elements than flattened destination \
+ object.");
+ for (unsigned i = 0; i < StoreGEPList.size(); i++) {
+ llvm::Value *Load = CGF.Builder.CreateExtractElement(SrcVal, i, "vec.load");
+ llvm::Value *Cast =
+ CGF.EmitScalarConversion(Load, SrcTy, DestTypes[i], Loc);
+
+ // store back
+ llvm::Value *Idx = StoreGEPList[i].second;
+ if (Idx) {
+ llvm::Value *V =
+ CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
+ Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
}
- return;
+ CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
}
- llvm_unreachable("HLSL Flat cast doesn't handle splatting.");
+ return;
}
// emit a flat cast where the RHS is an aggregate
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index e0f9be5642cd778..e7f5a4f06f9bcb5 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2792,11 +2792,10 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
SourceLocation Loc = CE->getExprLoc();
QualType SrcTy = E->getType();
- if (RV.isAggregate()) { // RHS is an aggregate
- Address SrcVal = RV.getAggregateAddress();
- return EmitHLSLAggregateFlatCast(CGF, SrcVal, SrcTy, DestTy, Loc);
- }
- llvm_unreachable("Not a valid HLSL Flat Cast.");
+ assert(RV.isAggregate() && "Not a valid HLSL Flat Cast.");
+ // RHS is an aggregate
+ Address SrcVal = RV.getAggregateAddress();
+ return EmitHLSLAggregateFlatCast(CGF, SrcVal, SrcTy, DestTy, Loc);
}
} // end of switch
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 3ba5b1f9a95bf29..56013b1ff1566b0 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2477,7 +2477,7 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
}
// Can we perform an HLSL Flattened cast?
-// TODO: update this code when matrices are added
+// TODO: update this code when matrices are added; see issue #88060
bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
// Don't handle casts where LHS and RHS are any combination of scalar/vector
@@ -2500,11 +2500,20 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
if (SrcTypes.size() < DestTypes.size())
return false;
- for (unsigned i = 0; i < DestTypes.size() && i < SrcTypes.size(); i++) {
- if (!CanPerformScalarCast(SrcTypes[i], DestTypes[i])) {
+ unsigned I;
+ for (I = 0; I < DestTypes.size() && I < SrcTypes.size(); I++) {
+ if (SrcTypes[I]->isUnionType() || DestTypes[I]->isUnionType())
+ return false;
+ if (!CanPerformScalarCast(SrcTypes[I], DestTypes[I])) {
return false;
}
}
+
+ // Check the remaining source types for unions.
+ for (; I < SrcTypes.size(); I++) {
+ if (SrcTypes[I]->isUnionType())
+ return false;
+ }
return true;
}
>From afb5be23c1e0c0e287f86c60580dd03a84dddfd7 Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 22 Jan 2025 12:10:56 -0800
Subject: [PATCH 11/18] add case to switch
---
clang/lib/CodeGen/CGExprAgg.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 7ac9188ed077d63..9a433f3850f1385 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1552,7 +1552,7 @@ static bool castPreservesZero(const CastExpr *CE) {
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
case CK_HLSLVectorTruncation:
- // TODO does CK_HLSLAggregateCast preserve zero?
+ case CK_HLSLAggregateCast: // TODO does CK_HLSLAggregateCast preserve zero?
return true;
case CK_BaseToDerivedMemberPointer:
>From c08f927c6d18f08eed81369d70bce290e9540795 Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Fri, 31 Jan 2025 16:54:55 -0800
Subject: [PATCH 12/18] add newline to end of file
---
clang/test/SemaHLSL/Language/FlatCast-errors.hlsl | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl b/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
index 0197e8b1e36760a..b5f482940dbdd9f 100644
--- a/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
+++ b/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
@@ -5,4 +5,4 @@ export void cantCast() {
int B[4] = {1,2,3,4};
B = (int[4])A;
// expected-error at -1 {{C-style cast from 'int *' to 'int[4]' is not allowed}}
-}
\ No newline at end of file
+}
>From 20bab28fda6f3d1b6cd4c597ea514b32c223e9c3 Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Fri, 31 Jan 2025 17:01:36 -0800
Subject: [PATCH 13/18] update cases with the cast
---
clang/lib/CodeGen/CGExprAgg.cpp | 2 +-
clang/lib/Sema/Sema.cpp | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 9a433f3850f1385..bc8e1a80331ded6 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1552,7 +1552,7 @@ static bool castPreservesZero(const CastExpr *CE) {
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
case CK_HLSLVectorTruncation:
- case CK_HLSLAggregateCast: // TODO does CK_HLSLAggregateCast preserve zero?
+ case CK_HLSLAggregateCast:
return true;
case CK_BaseToDerivedMemberPointer:
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 2f0528d6ab5ce14..d6517511d7db4d2 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -707,7 +707,6 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty,
case CK_ToVoid:
case CK_NonAtomicToAtomic:
case CK_HLSLArrayRValue:
- case CK_HLSLAggregateCast:
break;
}
}
>From a0f5473618fb329d35e0c4db592b17461d5fe87f Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 5 Feb 2025 09:28:51 -0800
Subject: [PATCH 14/18] address pr comments
---
clang/lib/AST/ExprConstant.cpp | 2 +-
clang/lib/CodeGen/CGExpr.cpp | 20 +++++++-------------
clang/lib/CodeGen/CGExprAgg.cpp | 14 +++++++-------
clang/lib/CodeGen/CGExprScalar.cpp | 13 +++++++------
4 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index e782e6227234afe..a3f8e26b9a7823c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14857,7 +14857,6 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_FixedPointCast:
case CK_IntegralToFixedPoint:
case CK_MatrixCast:
- // TODO does CK_HLSLAggregateCast belong here?
llvm_unreachable("invalid cast kind for integral value");
case CK_BitCast:
@@ -14876,6 +14875,7 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_NoOp:
case CK_LValueToRValueBitCast:
case CK_HLSLArrayRValue:
+ case CK_HLSLAggregateCast:
return ExprEvaluatorBaseTy::VisitCastExpr(E);
case CK_MemberPointerToBoolean:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 44f37c5bede06c1..401e7b04a7dd7b7 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6370,17 +6370,11 @@ void CodeGenFunction::FlattenAccessAndType(
16>
WorkList;
llvm::IntegerType *IdxTy = llvm::IntegerType::get(getLLVMContext(), 32);
- WorkList.push_back(
- {AddrType,
- {llvm::ConstantInt::get(
- IdxTy,
- 0)}}); // Addr should be a pointer so we need to 'dereference' it
+ // Addr should be a pointer so we need to 'dereference' it
+ WorkList.push_back({AddrType, {llvm::ConstantInt::get(IdxTy, 0)}});
while (!WorkList.empty()) {
- std::pair<QualType, llvm::SmallVector<llvm::Value *, 4>> P =
- WorkList.pop_back_val();
- QualType T = P.first;
- llvm::SmallVector<llvm::Value *, 4> IdxList = P.second;
+ auto [T, IdxList] = WorkList.pop_back_val();
T = T.getCanonicalType().getUnqualifiedType();
assert(!isa<MatrixType>(T) && "Matrix types not yet supported in HLSL");
if (const auto *CAT = dyn_cast<ConstantArrayType>(T)) {
@@ -6388,7 +6382,7 @@ void CodeGenFunction::FlattenAccessAndType(
for (int64_t I = Size - 1; I > -1; I--) {
llvm::SmallVector<llvm::Value *, 4> IdxListCopy = IdxList;
IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, I));
- WorkList.insert(WorkList.end(), {CAT->getElementType(), IdxListCopy});
+ WorkList.emplace_back(CAT->getElementType(), IdxListCopy);
}
} else if (const auto *RT = dyn_cast<RecordType>(T)) {
const RecordDecl *Record = RT->getDecl();
@@ -6419,8 +6413,8 @@ void CodeGenFunction::FlattenAccessAndType(
CharUnits Align = getContext().getTypeAlignInChars(T);
Address GEP =
Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "vector.gep");
- for (unsigned i = 0; i < VT->getNumElements(); i++) {
- llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, i);
+ for (unsigned I = 0, E = VT->getNumElements(); I < E; I++) {
+ llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, I);
// gep on vector fields is not recommended so combine gep with
// extract/insert
AccessList.push_back({GEP, Idx});
@@ -6432,7 +6426,7 @@ void CodeGenFunction::FlattenAccessAndType(
CharUnits Align = getContext().getTypeAlignInChars(T);
Address GEP =
Builder.CreateInBoundsGEP(Addr, IdxList, LLVMT, Align, "gep");
- AccessList.push_back({GEP, NULL});
+ AccessList.emplace_back(GEP, nullptr);
FlatTypes.push_back(T);
}
}
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index bc8e1a80331ded6..8755d231a097237 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -508,19 +508,19 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
"Cannot perform HLSL flat cast when vector source \
object has less elements than flattened destination \
object.");
- for (unsigned i = 0; i < StoreGEPList.size(); i++) {
- llvm::Value *Load = CGF.Builder.CreateExtractElement(SrcVal, i, "vec.load");
+ for (unsigned I = 0, Size = StoreGEPList.size(); I < Size; I++) {
+ llvm::Value *Load = CGF.Builder.CreateExtractElement(SrcVal, I, "vec.load");
llvm::Value *Cast =
- CGF.EmitScalarConversion(Load, SrcTy, DestTypes[i], Loc);
+ CGF.EmitScalarConversion(Load, SrcTy, DestTypes[I], Loc);
// store back
- llvm::Value *Idx = StoreGEPList[i].second;
+ llvm::Value *Idx = StoreGEPList[I].second;
if (Idx) {
llvm::Value *V =
- CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
+ CGF.Builder.CreateLoad(StoreGEPList[I].first, "load.for.insert");
Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
}
- CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
+ CGF.Builder.CreateStore(Cast, StoreGEPList[I].first);
}
return;
}
@@ -974,7 +974,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
if (RV.isScalar()) {
llvm::Value *SrcVal = RV.getScalarVal();
EmitHLSLScalarFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
- } else { // RHS is an aggregate
+ } else {
assert(RV.isAggregate() &&
"Can't perform HLSL Aggregate cast on a complex type.");
Address SrcVal = RV.getAggregateAddress();
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index e7f5a4f06f9bcb5..e7136ed11f3c1a2 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2272,24 +2272,25 @@ static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
// LHS is either a vector or a builtin type.
// If it's a vector, create a temp alloca to store into and return that.
if (auto *VecTy = LHSTy->getAs<VectorType>()) {
+ assert(SrcTypes.size() >= VecTy->getNumElements() &&
+ "Flattened type on RHS must have more elements than vector on LHS.");
llvm::Value *V =
CGF.Builder.CreateLoad(CGF.CreateIRTemp(LHSTy, "flatcast.tmp"));
// write to V.
- for (unsigned i = 0; i < VecTy->getNumElements(); i++) {
- llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[i].first, "load");
- llvm::Value *Idx = LoadGEPList[i].second;
+ for (unsigned I = 0, E = VecTy->getNumElements(); I < E; I++) {
+ llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[I].first, "load");
+ llvm::Value *Idx = LoadGEPList[I].second;
Load = Idx ? CGF.Builder.CreateExtractElement(Load, Idx, "vec.extract")
: Load;
llvm::Value *Cast = CGF.EmitScalarConversion(
- Load, SrcTypes[i], VecTy->getElementType(), Loc);
- V = CGF.Builder.CreateInsertElement(V, Cast, i);
+ Load, SrcTypes[I], VecTy->getElementType(), Loc);
+ V = CGF.Builder.CreateInsertElement(V, Cast, I);
}
return V;
}
// If it's a builtin, just do an extract element or load.
assert(LHSTy->isBuiltinType() &&
"Destination type must be a vector or builtin type.");
- // TODO add asserts about things being long enough
llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[0].first, "load");
llvm::Value *Idx = LoadGEPList[0].second;
Load =
>From a7252d9be2ff961729cb9380a7b04cbd68c64088 Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 5 Feb 2025 09:53:24 -0800
Subject: [PATCH 15/18] self review of loops
---
clang/lib/CodeGen/CGExpr.cpp | 6 +++---
clang/lib/CodeGen/CGExprAgg.cpp | 14 +++++++-------
clang/lib/Sema/SemaHLSL.cpp | 6 ++++--
3 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 401e7b04a7dd7b7..5247db116de2d45 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6403,10 +6403,10 @@ void CodeGenFunction::FlattenAccessAndType(
for (auto *FD : Record->fields())
FieldTypes.push_back(FD->getType());
- for (int64_t i = FieldTypes.size() - 1; i > -1; i--) {
+ for (int64_t I = FieldTypes.size() - 1; I > -1; I--) {
llvm::SmallVector<llvm::Value *, 4> IdxListCopy = IdxList;
- IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, i));
- WorkList.insert(WorkList.end(), {FieldTypes[i], IdxListCopy});
+ IdxListCopy.push_back(llvm::ConstantInt::get(IdxTy, I));
+ WorkList.insert(WorkList.end(), {FieldTypes[I], IdxListCopy});
}
} else if (const auto *VT = dyn_cast<VectorType>(T)) {
llvm::Type *LLVMT = ConvertTypeForMem(T);
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 8755d231a097237..dac2af4f023c7a1 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -545,22 +545,22 @@ static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
has less elements than flattened destination object.");
// apply casts to what we load from LoadGEPList
// and store result in Dest
- for (unsigned i = 0; i < StoreGEPList.size(); i++) {
- llvm::Value *Idx = LoadGEPList[i].second;
- llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[i].first, "load");
+ for (unsigned I = 0, E = StoreGEPList.size(); I < E; I++) {
+ llvm::Value *Idx = LoadGEPList[I].second;
+ llvm::Value *Load = CGF.Builder.CreateLoad(LoadGEPList[I].first, "load");
Load =
Idx ? CGF.Builder.CreateExtractElement(Load, Idx, "vec.extract") : Load;
llvm::Value *Cast =
- CGF.EmitScalarConversion(Load, SrcTypes[i], DestTypes[i], Loc);
+ CGF.EmitScalarConversion(Load, SrcTypes[I], DestTypes[I], Loc);
// store back
- Idx = StoreGEPList[i].second;
+ Idx = StoreGEPList[I].second;
if (Idx) {
llvm::Value *V =
- CGF.Builder.CreateLoad(StoreGEPList[i].first, "load.for.insert");
+ CGF.Builder.CreateLoad(StoreGEPList[I].first, "load.for.insert");
Cast = CGF.Builder.CreateInsertElement(V, Cast, Idx);
}
- CGF.Builder.CreateStore(Cast, StoreGEPList[i].first);
+ CGF.Builder.CreateStore(Cast, StoreGEPList[I].first);
}
}
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 56013b1ff1566b0..5bf58535bc0222b 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2500,8 +2500,10 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
if (SrcTypes.size() < DestTypes.size())
return false;
+ unsigned SrcSize = SrcTypes.size();
+ unsigned DstSize = DestTypes.size();
unsigned I;
- for (I = 0; I < DestTypes.size() && I < SrcTypes.size(); I++) {
+ for (I = 0; I < DstSize && I < SrcSize; I++) {
if (SrcTypes[I]->isUnionType() || DestTypes[I]->isUnionType())
return false;
if (!CanPerformScalarCast(SrcTypes[I], DestTypes[I])) {
@@ -2510,7 +2512,7 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
}
// Check the remaining source types for unions.
- for (; I < SrcTypes.size(); I++) {
+ for (; I < SrcSize; I++) {
if (SrcTypes[I]->isUnionType())
return false;
}
>From 87d09f82e2c24915bee5718b7b12e9a61ebc3f6f Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 5 Feb 2025 16:44:23 -0800
Subject: [PATCH 16/18] replace HLSLAggregateCast with HLSLElementwiseCast
---
clang/include/clang/AST/OperationKinds.def | 2 +-
clang/lib/AST/Expr.cpp | 2 +-
clang/lib/AST/ExprConstant.cpp | 4 ++--
clang/lib/CodeGen/CGExpr.cpp | 2 +-
clang/lib/CodeGen/CGExprAgg.cpp | 12 ++++++------
clang/lib/CodeGen/CGExprComplex.cpp | 2 +-
clang/lib/CodeGen/CGExprConstant.cpp | 2 +-
clang/lib/CodeGen/CGExprScalar.cpp | 10 +++++-----
clang/lib/Edit/RewriteObjCFoundationAPI.cpp | 2 +-
clang/lib/Sema/SemaCast.cpp | 2 +-
clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp | 2 +-
clang/test/SemaHLSL/Language/FlatCasts.hlsl | 4 ++--
12 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/clang/include/clang/AST/OperationKinds.def b/clang/include/clang/AST/OperationKinds.def
index 9323d4e861a7342..b3dc7c3d8dc77e1 100644
--- a/clang/include/clang/AST/OperationKinds.def
+++ b/clang/include/clang/AST/OperationKinds.def
@@ -368,7 +368,7 @@ CAST_OPERATION(HLSLVectorTruncation)
CAST_OPERATION(HLSLArrayRValue)
// Aggregate by Value cast (HLSL only).
-CAST_OPERATION(HLSLAggregateCast)
+CAST_OPERATION(HLSLElementwiseCast)
//===- Binary Operations -------------------------------------------------===//
// Operators listed in order of precedence.
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 4764bc84ce498a6..3129727703ef66e 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1942,7 +1942,7 @@ bool CastExpr::CastConsistency() const {
case CK_FixedPointToBoolean:
case CK_HLSLArrayRValue:
case CK_HLSLVectorTruncation:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
CheckNoBasePath:
assert(path_empty() && "Cast kind should not have a base path!");
break;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a3f8e26b9a7823c..068d7333212f636 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14875,7 +14875,7 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_NoOp:
case CK_LValueToRValueBitCast:
case CK_HLSLArrayRValue:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
return ExprEvaluatorBaseTy::VisitCastExpr(E);
case CK_MemberPointerToBoolean:
@@ -15734,7 +15734,7 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_IntegralToFixedPoint:
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
llvm_unreachable("invalid cast kind for complex value");
case CK_LValueToRValue:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5247db116de2d45..24cb3b9ccb55cbf 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5320,7 +5320,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) {
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
return EmitUnsupportedLValue(E, "unexpected cast lvalue");
case CK_Dependent:
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index dac2af4f023c7a1..c3f1cbed6b39f95 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -526,9 +526,9 @@ static void EmitHLSLScalarFlatCast(CodeGenFunction &CGF, Address DestVal,
}
// emit a flat cast where the RHS is an aggregate
-static void EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address DestVal,
- QualType DestTy, Address SrcVal,
- QualType SrcTy, SourceLocation Loc) {
+static void EmitHLSLElementwiseCast(CodeGenFunction &CGF, Address DestVal,
+ QualType DestTy, Address SrcVal,
+ QualType SrcTy, SourceLocation Loc) {
// Flatten our destination
SmallVector<QualType, 16> DestTypes; // Flattened type
SmallVector<std::pair<Address, llvm::Value *>, 16> StoreGEPList;
@@ -963,7 +963,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
case CK_HLSLArrayRValue:
Visit(E->getSubExpr());
break;
- case CK_HLSLAggregateCast: {
+ case CK_HLSLElementwiseCast: {
Expr *Src = E->getSubExpr();
QualType SrcTy = Src->getType();
RValue RV = CGF.EmitAnyExpr(Src);
@@ -978,7 +978,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
assert(RV.isAggregate() &&
"Can't perform HLSL Aggregate cast on a complex type.");
Address SrcVal = RV.getAggregateAddress();
- EmitHLSLAggregateFlatCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
+ EmitHLSLElementwiseCast(CGF, DestVal, DestTy, SrcVal, SrcTy, Loc);
}
break;
}
@@ -1552,7 +1552,7 @@ static bool castPreservesZero(const CastExpr *CE) {
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
case CK_HLSLVectorTruncation:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
return true;
case CK_BaseToDerivedMemberPointer:
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 05680d36aa2bd77..c2679ea92dc9728 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -610,7 +610,7 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
llvm_unreachable("invalid cast kind for complex value");
case CK_FloatingRealToComplex:
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 6d15bc9058e4501..ef11798869d3b13 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -1335,7 +1335,7 @@ class ConstExprEmitter
case CK_MatrixCast:
case CK_HLSLVectorTruncation:
case CK_HLSLArrayRValue:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
return nullptr;
}
llvm_unreachable("Invalid CastKind");
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index e7136ed11f3c1a2..d099f3bdfa18a32 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2263,9 +2263,9 @@ bool CodeGenFunction::ShouldNullCheckClassCastValue(const CastExpr *CE) {
}
// RHS is an aggregate type
-static Value *EmitHLSLAggregateFlatCast(CodeGenFunction &CGF, Address RHSVal,
- QualType RHSTy, QualType LHSTy,
- SourceLocation Loc) {
+static Value *EmitHLSLElementwiseCast(CodeGenFunction &CGF, Address RHSVal,
+ QualType RHSTy, QualType LHSTy,
+ SourceLocation Loc) {
SmallVector<std::pair<Address, llvm::Value *>, 16> LoadGEPList;
SmallVector<QualType, 16> SrcTypes; // Flattened type
CGF.FlattenAccessAndType(RHSVal, RHSTy, LoadGEPList, SrcTypes);
@@ -2788,7 +2788,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy);
return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc");
}
- case CK_HLSLAggregateCast: {
+ case CK_HLSLElementwiseCast: {
RValue RV = CGF.EmitAnyExpr(E);
SourceLocation Loc = CE->getExprLoc();
QualType SrcTy = E->getType();
@@ -2796,7 +2796,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
assert(RV.isAggregate() && "Not a valid HLSL Flat Cast.");
// RHS is an aggregate
Address SrcVal = RV.getAggregateAddress();
- return EmitHLSLAggregateFlatCast(CGF, SrcVal, SrcTy, DestTy, Loc);
+ return EmitHLSLElementwiseCast(CGF, SrcVal, SrcTy, DestTy, Loc);
}
} // end of switch
diff --git a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
index 63308319a78d1cc..32f5ebb55155ed1 100644
--- a/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
+++ b/clang/lib/Edit/RewriteObjCFoundationAPI.cpp
@@ -1085,7 +1085,7 @@ static bool rewriteToNumericBoxedExpression(const ObjCMessageExpr *Msg,
llvm_unreachable("OpenCL-specific cast in Objective-C?");
case CK_HLSLVectorTruncation:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
llvm_unreachable("HLSL-specific cast in Objective-C?");
break;
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 35a9afc0d160723..2befb6d6f748a5a 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -2781,7 +2781,7 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
SrcExpr = Self.ImpCastExprToType(
SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
- Kind = CK_HLSLAggregateCast;
+ Kind = CK_HLSLElementwiseCast;
return;
}
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index b105c196fc3bfb3..3a983421358c7f4 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -522,7 +522,7 @@ void ExprEngine::VisitCast(const CastExpr *CastE, const Expr *Ex,
case CK_ToUnion:
case CK_MatrixCast:
case CK_VectorSplat:
- case CK_HLSLAggregateCast:
+ case CK_HLSLElementwiseCast:
case CK_HLSLVectorTruncation: {
QualType resultType = CastE->getType();
if (CastE->isGLValue())
diff --git a/clang/test/SemaHLSL/Language/FlatCasts.hlsl b/clang/test/SemaHLSL/Language/FlatCasts.hlsl
index c869b32f0276c8b..563d3f02a1485a2 100644
--- a/clang/test/SemaHLSL/Language/FlatCasts.hlsl
+++ b/clang/test/SemaHLSL/Language/FlatCasts.hlsl
@@ -2,7 +2,7 @@
// truncation
// CHECK-LABEL: call1
-// CHECK: CStyleCastExpr {{.*}} 'int[1]' <HLSLAggregateCast>
+// CHECK: CStyleCastExpr {{.*}} 'int[1]' <HLSLElementwiseCast>
// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[2]' <HLSLArrayRValue> part_of_explicit_cast
// CHECK-NEXT: DeclRefExpr {{.*}} 'int[2]' lvalue Var {{.*}} 'A' 'int[2]'
export void call1() {
@@ -13,7 +13,7 @@ export void call1() {
// flat cast of equal size
// CHECK-LABEL: call2
-// CHECK: CStyleCastExpr {{.*}} 'float[1]' <HLSLAggregateCast>
+// CHECK: CStyleCastExpr {{.*}} 'float[1]' <HLSLElementwiseCast>
// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[1]' <HLSLArrayRValue> part_of_explicit_cast
// CHECK-NEXT: DeclRefExpr {{.*}} 'int[1]' lvalue Var {{.*}} 'A' 'int[1]'
export void call2() {
>From 03ed41750c78c9086d078121085ad03aef6f9ede Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 5 Feb 2025 16:54:20 -0800
Subject: [PATCH 17/18] rename test files
---
.../{ArrayFlatCast.hlsl => ArrayElementwiseCast.hlsl} | 0
.../{StructFlatCast.hlsl => StructElementwiseCast.hlsl} | 0
.../{VectorFlatCast.hlsl => VectorElementwiseCast.hlsl} | 0
.../{FlatCast-errors.hlsl => ElementwiseCast-errors.hlsl} | 0
.../SemaHLSL/Language/{FlatCasts.hlsl => ElementwiseCasts.hlsl} | 0
5 files changed, 0 insertions(+), 0 deletions(-)
rename clang/test/CodeGenHLSL/BasicFeatures/{ArrayFlatCast.hlsl => ArrayElementwiseCast.hlsl} (100%)
rename clang/test/CodeGenHLSL/BasicFeatures/{StructFlatCast.hlsl => StructElementwiseCast.hlsl} (100%)
rename clang/test/CodeGenHLSL/BasicFeatures/{VectorFlatCast.hlsl => VectorElementwiseCast.hlsl} (100%)
rename clang/test/SemaHLSL/Language/{FlatCast-errors.hlsl => ElementwiseCast-errors.hlsl} (100%)
rename clang/test/SemaHLSL/Language/{FlatCasts.hlsl => ElementwiseCasts.hlsl} (100%)
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayElementwiseCast.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/BasicFeatures/ArrayFlatCast.hlsl
rename to clang/test/CodeGenHLSL/BasicFeatures/ArrayElementwiseCast.hlsl
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/BasicFeatures/StructFlatCast.hlsl
rename to clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/BasicFeatures/VectorFlatCast.hlsl
rename to clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
diff --git a/clang/test/SemaHLSL/Language/FlatCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
similarity index 100%
rename from clang/test/SemaHLSL/Language/FlatCast-errors.hlsl
rename to clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
diff --git a/clang/test/SemaHLSL/Language/FlatCasts.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl
similarity index 100%
rename from clang/test/SemaHLSL/Language/FlatCasts.hlsl
rename to clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl
>From d23411adf0fede190cc68015e0547c766e18069c Mon Sep 17 00:00:00 2001
From: Sarah Spall <sarahspall at microsoft.com>
Date: Wed, 5 Feb 2025 18:58:29 -0800
Subject: [PATCH 18/18] error for bitfields. tests for bitfield errors. minor
pr suggestion change
---
clang/include/clang/Sema/SemaHLSL.h | 3 +-
clang/lib/CodeGen/CGExpr.cpp | 2 +-
clang/lib/Sema/SemaCast.cpp | 2 +-
clang/lib/Sema/SemaHLSL.cpp | 41 ++++++++++++++++++-
.../Language/ElementwiseCast-errors.hlsl | 21 ++++++++++
5 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 6bda1e8ce0ea5be..a2ad7c133c794a1 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -141,7 +141,8 @@ class SemaHLSL : public SemaBase {
bool diagnoseInputIDType(QualType T, const ParsedAttr &AL);
bool CanPerformScalarCast(QualType SrcTy, QualType DestTy);
- bool CanPerformAggregateCast(Expr *Src, QualType DestType);
+ bool ContainsBitField(QualType BaseTy);
+ bool CanPerformElementwiseCast(Expr *Src, QualType DestType);
ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg);
QualType getInoutParameterType(QualType Ty);
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 24cb3b9ccb55cbf..df25abfd84ac010 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6417,7 +6417,7 @@ void CodeGenFunction::FlattenAccessAndType(
llvm::Value *Idx = llvm::ConstantInt::get(IdxTy, I);
// gep on vector fields is not recommended so combine gep with
// extract/insert
- AccessList.push_back({GEP, Idx});
+ AccessList.emplace_back(GEP, Idx);
FlatTypes.push_back(VT->getElementType());
}
} else {
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 2befb6d6f748a5a..9a7a94a8fe432c0 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -2776,7 +2776,7 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
// vector cast, vector truncation, or special hlsl splat cases
QualType SrcTy = SrcExpr.get()->getType();
if (Self.getLangOpts().HLSL &&
- Self.HLSL().CanPerformAggregateCast(SrcExpr.get(), DestType)) {
+ Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) {
if (SrcTy->isConstantArrayType())
SrcExpr = Self.ImpCastExprToType(
SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 5bf58535bc0222b..33416072c59b4b8 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2476,9 +2476,43 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) {
llvm_unreachable("Unhandled scalar cast");
}
-// Can we perform an HLSL Flattened cast?
+// Detect if a type contains a bitfield. Will be removed when
+// bitfield support is added to HLSLElementwiseCast
+bool SemaHLSL::ContainsBitField(QualType BaseTy) {
+ llvm::SmallVector<QualType, 16> WorkList;
+ WorkList.push_back(BaseTy);
+ while (!WorkList.empty()) {
+ QualType T = WorkList.pop_back_val();
+ T = T.getCanonicalType().getUnqualifiedType();
+ // only check aggregate types
+ if (const auto *AT = dyn_cast<ConstantArrayType>(T)) {
+ WorkList.push_back(AT->getElementType());
+ continue;
+ }
+ if (const auto *RT = dyn_cast<RecordType>(T)) {
+ const RecordDecl *RD = RT->getDecl();
+ if (RD->isUnion())
+ continue;
+
+ const CXXRecordDecl *CXXD = dyn_cast<CXXRecordDecl>(RD);
+
+ if (CXXD && CXXD->isStandardLayout())
+ RD = CXXD->getStandardLayoutBaseWithFields();
+
+ for (const auto *FD : RD->fields()) {
+ if (FD->isBitField())
+ return true;
+ WorkList.push_back(FD->getType());
+ }
+ continue;
+ }
+ }
+ return false;
+}
+
+// Can we perform an HLSL Elementwise cast?
// TODO: update this code when matrices are added; see issue #88060
-bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
+bool SemaHLSL::CanPerformElementwiseCast(Expr *Src, QualType DestTy) {
// Don't handle casts where LHS and RHS are any combination of scalar/vector
// There must be an aggregate somewhere
@@ -2490,6 +2524,9 @@ bool SemaHLSL::CanPerformAggregateCast(Expr *Src, QualType DestTy) {
(DestTy->isScalarType() || DestTy->isVectorType()))
return false;
+ if (ContainsBitField(DestTy) || ContainsBitField(SrcTy))
+ return false;
+
llvm::SmallVector<QualType> DestTypes;
BuildFlattenedTypeList(DestTy, DestTypes);
llvm::SmallVector<QualType> SrcTypes;
diff --git a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
index b5f482940dbdd9f..c900c83a063a06b 100644
--- a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
+++ b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
@@ -6,3 +6,24 @@ export void cantCast() {
B = (int[4])A;
// expected-error at -1 {{C-style cast from 'int *' to 'int[4]' is not allowed}}
}
+
+struct S {
+// expected-note at -1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int2' (aka 'vector<int, 2>') to 'const S' for 1st argument}}
+// expected-note at -2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int2' (aka 'vector<int, 2>') to 'S' for 1st argument}}
+// expected-note at -3 {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}}
+ int A : 8;
+ int B;
+};
+
+// casting types which contain bitfields is not yet supported.
+export void cantCast2() {
+ S s = {1,2};
+ int2 C = (int2)s;
+ // expected-error at -1 {{cannot convert 'S' to 'int2' (aka 'vector<int, 2>') without a conversion operator}}
+}
+
+export void cantCast3() {
+ int2 C = {1,2};
+ S s = (S)C;
+ // expected-error at -1 {{no matching conversion for C-style cast from 'int2' (aka 'vector<int, 2>') to 'S'}}
+}
More information about the cfe-commits
mailing list