[llvm] [DirectX] Implement `memcpy` in DXIL CBuffer Access pass (PR #144436)

Deric C. via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 20 10:16:13 PDT 2025


https://github.com/Icohedron updated https://github.com/llvm/llvm-project/pull/144436

>From f1988f43da9b901d40ce6cee8c34ae6bbd8b47af Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Mon, 2 Jun 2025 22:53:53 +0000
Subject: [PATCH 1/5] Expect only Load and GEP users of Globals, and report
 error otherwise

---
 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index 7559f61b4cfb9..cfd7cb95ca8f0 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -11,9 +11,12 @@
 #include "llvm/Frontend/HLSL/CBuffer.h"
 #include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 #define DEBUG_TYPE "dxil-cbuffer-access"
@@ -161,7 +164,12 @@ static void replaceAccessesWithHandle(GlobalVariable *Global,
     }
 
     // Otherwise, walk users looking for a load...
-    ToProcess.append(Cur->user_begin(), Cur->user_end());
+    if (isa<GetElementPtrInst>(Cur) || isa<GEPOperator>(Cur)) {
+      ToProcess.append(Cur->user_begin(), Cur->user_end());
+      continue;
+    }
+
+    reportFatalInternalError("Unexpected user of Global");
   }
   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
 }

>From c24411f1785d78263e7e9957a80bd3223181fb5d Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Fri, 13 Jun 2025 18:07:19 +0000
Subject: [PATCH 2/5] Refactor DXIL CBuffer Access pass

Consolidate the handle creation and load lowering into a CBufferResource
helper so the logic can be reused when adding support for more kinds of
cbuffer users.
---
 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp | 210 +++++++++++-------
 1 file changed, 128 insertions(+), 82 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index cfd7cb95ca8f0..775a7dfd99230 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -17,6 +17,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 #define DEBUG_TYPE "dxil-cbuffer-access"
@@ -57,109 +58,153 @@ struct CBufferRowIntrin {
     }
   }
 };
-} // namespace
 
-static size_t getOffsetForCBufferGEP(GEPOperator *GEP, GlobalVariable *Global,
-                                     const DataLayout &DL) {
-  // Since we should always have a constant offset, we should only ever have a
-  // single GEP of indirection from the Global.
-  assert(GEP->getPointerOperand() == Global &&
-         "Indirect access to resource handle");
+// Helper for creating CBuffer handles and loading data from them
+struct CBufferResource {
+  GlobalVariable *GVHandle;
+  GlobalVariable *Member;
+  size_t MemberOffset;
 
-  APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
-  bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset);
-  (void)Success;
-  assert(Success && "Offsets into cbuffer globals must be constant");
+  LoadInst *Handle = nullptr; // Set by createAndSetCurrentHandle
 
-  if (auto *ATy = dyn_cast<ArrayType>(Global->getValueType()))
-    ConstantOffset = hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy);
+  CBufferResource(GlobalVariable *GVHandle, GlobalVariable *Member,
+                  size_t MemberOffset)
+      : GVHandle(GVHandle), Member(Member), MemberOffset(MemberOffset) {}
 
-  return ConstantOffset.getZExtValue();
-}
+  const DataLayout &getDataLayout() { return GVHandle->getDataLayout(); }
+  Type *getValueType() { return Member->getValueType(); }
+  iterator_range<ConstantDataSequential::user_iterator> users() {
+    return Member->users();
+  }
 
-/// Replace access via cbuffer global with a load from the cbuffer handle
-/// itself.
-static void replaceAccess(LoadInst *LI, GlobalVariable *Global,
-                          GlobalVariable *HandleGV, size_t BaseOffset,
-                          SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
-  const DataLayout &DL = HandleGV->getDataLayout();
+  /// Get the byte offset of a Pointer-typed Value * `Val` relative to Member.
+  /// `Val` can either be Member itself, or a GEP of a constant offset from
+  /// Member
+  size_t getOffsetForCBufferGEP(Value *Val) {
+    assert(isa<PointerType>(Val->getType()) &&
+           "Expected a pointer-typed value");
+
+    if (Val == Member)
+      return 0;
+
+    if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
+      // Since we should always have a constant offset, we should only ever have
+      // a single GEP of indirection from the Global.
+      assert(GEP->getPointerOperand() == Member &&
+             "Indirect access to resource handle");
+
+      const DataLayout &DL = getDataLayout();
+      APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+      bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset);
+      (void)Success;
+      assert(Success && "Offsets into cbuffer globals must be constant");
+
+      if (auto *ATy = dyn_cast<ArrayType>(Member->getValueType()))
+        ConstantOffset =
+            hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy);
+
+      return ConstantOffset.getZExtValue();
+    }
 
-  size_t Offset = BaseOffset;
-  if (auto *GEP = dyn_cast<GEPOperator>(LI->getPointerOperand()))
-    Offset += getOffsetForCBufferGEP(GEP, Global, DL);
-  else if (LI->getPointerOperand() != Global)
-    llvm_unreachable("Load instruction doesn't reference cbuffer global");
+    llvm_unreachable("Expected Val to be a GlobalVariable or GEP");
+  }
 
-  IRBuilder<> Builder(LI);
-  auto *Handle = Builder.CreateLoad(HandleGV->getValueType(), HandleGV,
-                                    HandleGV->getName());
-
-  Type *Ty = LI->getType();
-  CBufferRowIntrin Intrin(DL, Ty->getScalarType());
-  // The cbuffer consists of some number of 16-byte rows.
-  unsigned int CurrentRow = Offset / hlsl::CBufferRowSizeInBytes;
-  unsigned int CurrentIndex =
-      (Offset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
-
-  auto *CBufLoad = Builder.CreateIntrinsic(
-      Intrin.RetTy, Intrin.IID,
-      {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr,
-      LI->getName());
-  auto *Elt =
-      Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, LI->getName());
-
-  Value *Result = nullptr;
-  unsigned int Remaining =
-      ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
-  if (Remaining == 0) {
-    // We only have a single element, so we're done.
-    Result = Elt;
-
-    // However, if we loaded a <1 x T>, then we need to adjust the type here.
-    if (auto *VT = dyn_cast<FixedVectorType>(LI->getType())) {
-      assert(VT->getNumElements() == 1 && "Can't have multiple elements here");
-      Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
-                                           Builder.getInt32(0));
-    }
-  } else {
-    // Walk each element and extract it, wrapping to new rows as needed.
-    SmallVector<Value *> Extracts{Elt};
-    while (Remaining--) {
-      CurrentIndex %= Intrin.NumElts;
-
-      if (CurrentIndex == 0)
-        CBufLoad = Builder.CreateIntrinsic(
-            Intrin.RetTy, Intrin.IID,
-            {Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)},
-            nullptr, LI->getName());
-
-      Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
-                                                    LI->getName()));
+  /// Create a handle for this cbuffer resource using the IRBuilder `Builder`
+  /// and set it as the current handle to use for subsequent calls to
+  /// `loadValue`
+  void createAndSetCurrentHandle(IRBuilder<> &Builder) {
+    Handle = Builder.CreateLoad(GVHandle->getValueType(), GVHandle,
+                                GVHandle->getName());
+  }
+
+  /// Load a value of type `Ty` at offset `Offset` using the handle from the
+  /// last call to `createAndSetCurrentHandle`
+  Value *loadValue(IRBuilder<> &Builder, Type *Ty, size_t Offset,
+                   const Twine &Name = "") {
+    assert(Handle &&
+           "Expected a handle for this cbuffer global resource to be created "
+           "before loading a value from it");
+    const DataLayout &DL = getDataLayout();
+
+    size_t TargetOffset = MemberOffset + Offset;
+    CBufferRowIntrin Intrin(DL, Ty->getScalarType());
+    // The cbuffer consists of some number of 16-byte rows.
+    unsigned int CurrentRow = TargetOffset / hlsl::CBufferRowSizeInBytes;
+    unsigned int CurrentIndex =
+        (TargetOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
+
+    auto *CBufLoad = Builder.CreateIntrinsic(
+        Intrin.RetTy, Intrin.IID,
+        {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr,
+        Name + ".load");
+    auto *Elt = Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
+                                           Name + ".extract");
+
+    Value *Result = nullptr;
+    unsigned int Remaining =
+        ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
+    if (Remaining == 0) {
+      // We only have a single element, so we're done.
+      Result = Elt;
+
+      // However, if we loaded a <1 x T>, then we need to adjust the type here.
+      if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+        assert(VT->getNumElements() == 1 &&
+               "Can't have multiple elements here");
+        Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
+                                             Builder.getInt32(0), Name);
+      }
+    } else {
+      // Walk each element and extract it, wrapping to new rows as needed.
+      SmallVector<Value *> Extracts{Elt};
+      while (Remaining--) {
+        CurrentIndex %= Intrin.NumElts;
+
+        if (CurrentIndex == 0)
+          CBufLoad = Builder.CreateIntrinsic(
+              Intrin.RetTy, Intrin.IID,
+              {Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)},
+              nullptr, Name + ".load");
+
+        Extracts.push_back(Builder.CreateExtractValue(
+            CBufLoad, {CurrentIndex++}, Name + ".extract"));
+      }
+
+      // Finally, we build up the original loaded value.
+      Result = PoisonValue::get(Ty);
+      for (int I = 0, E = Extracts.size(); I < E; ++I)
+        Result = Builder.CreateInsertElement(Result, Extracts[I],
+                                             Builder.getInt32(I),
+                                             Name + formatv(".upto{}", I));
     }
 
-    // Finally, we build up the original loaded value.
-    Result = PoisonValue::get(Ty);
-    for (int I = 0, E = Extracts.size(); I < E; ++I)
-      Result =
-          Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I));
+    return Result;
   }
+};
 
+} // namespace
+
+/// Replace load via cbuffer global with a load from the cbuffer handle itself.
+static void replaceLoad(LoadInst *LI, CBufferResource &CBR,
+                        SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+  size_t Offset = CBR.getOffsetForCBufferGEP(LI->getPointerOperand());
+  IRBuilder<> Builder(LI);
+  CBR.createAndSetCurrentHandle(Builder);
+  Value *Result = CBR.loadValue(Builder, LI->getType(), Offset, LI->getName());
   LI->replaceAllUsesWith(Result);
   DeadInsts.push_back(LI);
 }
 
-static void replaceAccessesWithHandle(GlobalVariable *Global,
-                                      GlobalVariable *HandleGV,
-                                      size_t BaseOffset) {
+static void replaceAccessesWithHandle(CBufferResource &CBR) {
   SmallVector<WeakTrackingVH> DeadInsts;
 
-  SmallVector<User *> ToProcess{Global->users()};
+  SmallVector<User *> ToProcess{CBR.users()};
   while (!ToProcess.empty()) {
     User *Cur = ToProcess.pop_back_val();
 
     // If we have a load instruction, replace the access.
     if (auto *LI = dyn_cast<LoadInst>(Cur)) {
-      replaceAccess(LI, Global, HandleGV, BaseOffset, DeadInsts);
+      replaceLoad(LI, CBR, DeadInsts);
       continue;
     }
 
@@ -181,7 +226,8 @@ static bool replaceCBufferAccesses(Module &M) {
 
   for (const hlsl::CBufferMapping &Mapping : *CBufMD)
     for (const hlsl::CBufferMember &Member : Mapping.Members) {
-      replaceAccessesWithHandle(Member.GV, Mapping.Handle, Member.Offset);
+      CBufferResource CBR(Mapping.Handle, Member.GV, Member.Offset);
+      replaceAccessesWithHandle(CBR);
       Member.GV->removeFromParent();
     }
 

>From b96adb5d56ca4ab84be089d5ea4b3dfa8220ab86 Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Mon, 16 Jun 2025 22:11:44 +0000
Subject: [PATCH 3/5] Implement memcpy in DXIL CBuffer Access

---
 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp |  83 +++++++
 .../CodeGen/DirectX/CBufferAccess/memcpy.ll   | 204 ++++++++++++++++++
 2 files changed, 287 insertions(+)
 create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll

diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index 775a7dfd99230..e4a0ad0dd6159 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -195,6 +195,82 @@ static void replaceLoad(LoadInst *LI, CBufferResource &CBR,
   DeadInsts.push_back(LI);
 }
 
+/// Replace memcpy from a cbuffer global with loads from the cbuffer handle
+/// itself. Assumes the cbuffer global is an array, and the length of bytes to
+/// copy is divisible by array element allocation size.
+/// The memcpy source must also be a direct cbuffer global reference, not a GEP.
+static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR,
+                          SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+
+  ArrayType *ArrTy = dyn_cast<ArrayType>(CBR.getValueType());
+  assert(ArrTy && "MemCpy lowering is only supported for array types");
+
+  // Only a direct reference to the cbuffer global is supported as the source
+  if (MCI->getSource() != CBR.Member)
+    reportFatalUsageError(
+        "Expected MemCpy source to be a cbuffer global variable");
+
+  // The copy is fully unrolled here, so its length must be a known constant;
+  // dyn_cast yields null otherwise and must not be dereferenced
+  auto *Length = dyn_cast<ConstantInt>(MCI->getLength());
+  if (!Length)
+    reportFatalUsageError("Expected MemCpy length to be a constant");
+  uint64_t ByteLength = Length->getZExtValue();
+
+  // If length to copy is zero, no memcpy is needed
+  if (ByteLength == 0) {
+    DeadInsts.push_back(MCI);
+    return;
+  }
+
+  const DataLayout &DL = CBR.getDataLayout();
+  size_t ElemSize = DL.getTypeAllocSize(ArrTy->getElementType());
+  assert(ByteLength % ElemSize == 0 &&
+         "Length of bytes to MemCpy must be divisible by allocation size of "
+         "source/destination array elements");
+  size_t ElemsToCpy = ByteLength / ElemSize;
+  const std::string Name = ("memcpy." + MCI->getDest()->getName() + "." +
+                            MCI->getSource()->getName())
+                               .str();
+  IRBuilder<> Builder(MCI);
+  CBR.createAndSetCurrentHandle(Builder);
+
+  auto CopyElemsImpl = [&Builder, &MCI, &Name, &CBR,
+                        &DL](const auto &Self, ArrayType *ArrTy,
+                             size_t ArrOffset, size_t N) -> void {
+    Type *ElemTy = ArrTy->getElementType();
+    size_t ElemTySize = DL.getTypeAllocSize(ElemTy);
+    for (unsigned I = 0; I < N; ++I) {
+      size_t Offset = ArrOffset + I * ElemTySize;
+
+      // Recursively copy nested arrays
+      if (ArrayType *ElemArrTy = dyn_cast<ArrayType>(ElemTy)) {
+        Self(Self, ElemArrTy, Offset, ElemArrTy->getNumElements());
+        continue;
+      }
+
+      // Load CBuffer value and store it in Dest
+      APInt CBufArrayOffset(
+          DL.getIndexTypeSizeInBits(MCI->getSource()->getType()), Offset);
+      CBufArrayOffset =
+          hlsl::translateCBufArrayOffset(DL, CBufArrayOffset, ArrTy);
+      Value *CBufferVal =
+          CBR.loadValue(Builder, ElemTy, CBufArrayOffset.getZExtValue(), Name);
+      Value *GEP =
+          Builder.CreateInBoundsGEP(Builder.getInt8Ty(), MCI->getDest(),
+                                    {Builder.getInt32(Offset)}, Name + ".dest");
+      Builder.CreateStore(CBufferVal, GEP, MCI->isVolatile());
+    }
+  };
+  auto CopyElems = [&CopyElemsImpl](ArrayType *ArrTy, size_t N) -> void {
+    CopyElemsImpl(CopyElemsImpl, ArrTy, 0, N);
+  };
+
+  CopyElems(ArrTy, ElemsToCpy);
+
+  MCI->eraseFromParent();
+}
+
 static void replaceAccessesWithHandle(CBufferResource &CBR) {
   SmallVector<WeakTrackingVH> DeadInsts;
 
@@ -208,6 +284,13 @@ static void replaceAccessesWithHandle(CBufferResource &CBR) {
       continue;
     }
 
+    // If we have a memcpy instruction, replace it with multiple accesses and
+    // subsequent stores to the destination
+    if (auto *MCI = dyn_cast<MemCpyInst>(Cur)) {
+      replaceMemCpy(MCI, CBR, DeadInsts);
+      continue;
+    }
+
     // Otherwise, walk users looking for a load...
     if (isa<GetElementPtrInst>(Cur) || isa<GEPOperator>(Cur)) {
       ToProcess.append(Cur->user_begin(), Cur->user_end());
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
new file mode 100644
index 0000000000000..2cf7327d8c195
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
@@ -0,0 +1,204 @@
+; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s
+
+; cbuffer CB : register(b0) {
+;   float a1[3];
+;   double3 a2[2];
+;   float16_t a3[2][2];
+;   uint64_t a4[3];
+;   int2 a5[3][2];
+;   uint16_t a6[1];
+;   int64_t a7[2];
+;   bool a8[4];
+; }
+%__cblayout_CB = type <{ [3 x float], [2 x <3 x double>], [2 x [2 x half]], [3 x i64], [3 x [2 x <2 x i32>]], [1 x i16], [2 x i64], [4 x i32] }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) poison
+@a1 = external local_unnamed_addr addrspace(2) global [3 x float], align 4
+@a2 = external local_unnamed_addr addrspace(2) global [2 x <3 x double>], align 32
+@a3 = external local_unnamed_addr addrspace(2) global [2 x [2 x half]], align 2
+@a4 = external local_unnamed_addr addrspace(2) global [3 x i64], align 8
+@a5 = external local_unnamed_addr addrspace(2) global [3 x [2 x <2 x i32>]], align 16
+@a6 = external local_unnamed_addr addrspace(2) global [1 x i16], align 2
+@a7 = external local_unnamed_addr addrspace(2) global [2 x i64], align 8
+@a8 = external local_unnamed_addr addrspace(2) global [4 x i32], align 4
+
+; CHECK: define void @f(
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h.i.i = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) %CB.cb_h.i.i, ptr @CB.cb, align 4
+
+  %a1.copy = alloca [3 x float], align 4
+  %a2.copy = alloca [2 x <3 x double>], align 32
+  %a3.copy = alloca [2 x [2 x half]], align 2
+  %a4.copy = alloca [3 x i64], align 8
+  %a5.copy = alloca [3 x [2 x <2 x i32>]], align 16
+  %a6.copy = alloca [1 x i16], align 2
+  %a7.copy = alloca [2 x i64], align 8
+  %a8.copy = alloca [4 x i32], align 4
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0)
+; CHECK:    [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY:%.*]], i32 0
+; CHECK:    store float [[X]], ptr [[DEST]], align 4
+; CHECK:    [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1)
+; CHECK:    [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY]], i32 4
+; CHECK:    store float [[Y]], ptr [[DEST]], align 4
+; CHECK:    [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 2)
+; CHECK:    [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY]], i32 8
+; CHECK:    store float [[Z]], ptr [[DEST]], align 4
+  call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 12, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 3)
+; CHECK:    [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1
+; CHECK:    [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 4)
+; CHECK:    [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0
+; CHECK:    [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1
+; CHECK:    [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY:%.*]], i32 0
+; CHECK:    store <3 x double> [[UPTO2]], ptr [[DEST]], align 32
+; CHECK:    [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5)
+; CHECK:    [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1
+; CHECK:    [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 6)
+; CHECK:    [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0
+; CHECK:    [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1
+; CHECK:    [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 32
+; CHECK:    store <3 x double> [[UPTO2]], ptr [[DEST]], align 32
+  call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 64, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 7)
+; CHECK:    [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY:%.*]], i32 0
+; CHECK:    store half [[X]], ptr [[DEST]], align 2
+; CHECK:    [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 8)
+; CHECK:    [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 2
+; CHECK:    store half [[Y]], ptr [[DEST]], align 2
+; CHECK:    [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 9)
+; CHECK:    [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 4
+; CHECK:    store half [[X]], ptr [[DEST]], align 2
+; CHECK:    [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 10)
+; CHECK:    [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 6
+; CHECK:    store half [[Y]], ptr [[DEST]], align 2
+  call void @llvm.memcpy.p0.p2.i32(ptr align 2 %a3.copy, ptr addrspace(2) align 2 @a3, i32 8, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 11)
+; CHECK:    [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY:%.*]], i32 0
+; CHECK:    store i64 [[X]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 12)
+; CHECK:    [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY]], i32 8
+; CHECK:    store i64 [[Y]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 13)
+; CHECK:    [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY]], i32 16
+; CHECK:    store i64 [[Z]], ptr [[DEST]], align 8
+  call void @llvm.memcpy.p0.p2.i32(ptr align 8 %a4.copy, ptr addrspace(2) align 8 @a4, i32 24, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 14)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY:%.*]], i32 0
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 15)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 8
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 16)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 16
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 17)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 24
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 18)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 32
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 19)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+; CHECK:    [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0
+; CHECK:    [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 40
+; CHECK:    store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8
+  call void @llvm.memcpy.p0.p2.i32(ptr align 16 %a5.copy, ptr addrspace(2) align 16 @a5, i32 48, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 17)
+; CHECK:    [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A6_COPY:%.*]], i32 0
+; CHECK:    store i16 [[X]], ptr [[DEST]], align 2
+  call void @llvm.memcpy.p0.p2.i32(ptr align 2 %a6.copy, ptr addrspace(2) align 2 @a6, i32 2, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 18)
+; CHECK:    [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A7_COPY:%.*]], i32 0
+; CHECK:    store i64 [[X]], ptr [[DEST]], align 8
+; CHECK:    [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 19)
+; CHECK:    [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A7_COPY]], i32 8
+; CHECK:    store i64 [[Y]], ptr [[DEST]], align 8
+  call void @llvm.memcpy.p0.p2.i32(ptr align 8 %a7.copy, ptr addrspace(2) align 8 @a7, i32 16, i1 false)
+
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 20)
+; CHECK:    [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY:%.*]], i32 0
+; CHECK:    store i32 [[X]], ptr [[DEST]], align 4
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 21)
+; CHECK:    [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 4
+; CHECK:    store i32 [[Y]], ptr [[DEST]], align 4
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 22)
+; CHECK:    [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 8
+; CHECK:    store i32 [[Z]], ptr [[DEST]], align 4
+; CHECK:    [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 23)
+; CHECK:    [[W:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 12
+; CHECK:    store i32 [[W]], ptr [[DEST]], align 4
+  call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a8.copy, ptr addrspace(2) align 4 @a8, i32 16, i1 false)
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p2.i32(ptr noalias writeonly captures(none), ptr addrspace(2) noalias readonly captures(none), i32, i1 immarg)
+
+; CHECK-NOT: !hlsl.cbs =
+!hlsl.cbs = !{!0}
+
+!0 = !{ptr @CB.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7, ptr addrspace(2) @a8}
+!1 = !{i32 0, i32 2}
+!2 = !{}

>From 6ab934d72a0679866677232b9d7c7b5fd0175f14 Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Mon, 16 Jun 2025 22:25:17 +0000
Subject: [PATCH 4/5] Add comment explaining CopyElemsImpl function

---
 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index e4a0ad0dd6159..ee518908de281 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -235,6 +235,9 @@ static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR,
   IRBuilder<> Builder(MCI);
   CBR.createAndSetCurrentHandle(Builder);
 
+  // This lambda recursively copies N array elements from the CBuffer resource
+  // to the memcpy destination. Recursion is used to unravel multidimensional
+  // arrays into a sequence of scalar/vector extracts and stores.
   auto CopyElemsImpl = [&Builder, &MCI, &Name, &CBR,
                         &DL](const auto &Self, ArrayType *ArrTy,
                              size_t ArrOffset, size_t N) -> void {

>From 14ce485502a3da92365ddbfeb2cb83cefae43704 Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Fri, 20 Jun 2025 17:16:00 +0000
Subject: [PATCH 5/5] Add test for 0 byte copy and copy one element. Fix 0 byte
 copy implementation

---
 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp     |  7 +++----
 llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll | 12 ++++++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index ee518908de281..13abf6f1a7791 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -199,8 +199,7 @@ static void replaceLoad(LoadInst *LI, CBufferResource &CBR,
 /// itself. Assumes the cbuffer global is an array, and the length of bytes to
 /// copy is divisible by array element allocation size.
 /// The memcpy source must also be a direct cbuffer global reference, not a GEP.
-static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR,
-                          SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR) {
 
   ArrayType *ArrTy = dyn_cast<ArrayType>(CBR.getValueType());
   assert(ArrTy && "MemCpy lowering is only supported for array types");
@@ -219,7 +218,7 @@ static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR,
 
   // If length to copy is zero, no memcpy is needed
   if (ByteLength == 0) {
-    DeadInsts.push_back(MCI);
+    MCI->eraseFromParent();
     return;
   }
 
@@ -290,7 +289,7 @@ static void replaceAccessesWithHandle(CBufferResource &CBR) {
     // If we have a memcpy instruction, replace it with multiple accesses and
     // subsequent stores to the destination
     if (auto *MCI = dyn_cast<MemCpyInst>(Cur)) {
-      replaceMemCpy(MCI, CBR, DeadInsts);
+      replaceMemCpy(MCI, CBR);
       continue;
     }
 
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
index 2cf7327d8c195..001f3320137a6 100644
--- a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
@@ -37,6 +37,18 @@ entry:
   %a7.copy = alloca [2 x i64], align 8
   %a8.copy = alloca [4 x i32], align 4
 
+  ; Try copying no elements
+; CHECK-NOT: memcpy
+  call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 0, i1 false)
+
+  ; Try copying only the first element
+; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
+; CHECK:    [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0)
+; CHECK:    [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+; CHECK:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY:%.*]], i32 0
+; CHECK:    store float [[X]], ptr [[DEST]], align 4
+  call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 4, i1 false)
+
 ; CHECK:    [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
 ; CHECK:    [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0)
 ; CHECK:    [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0



More information about the llvm-commits mailing list