[llvm] 38cdadd - [DirectX] Teach MemIntrinsics about structs and nested arrays (#173078)

Mon Dec 22 12:28:45 PST 2025

Author: Justin Bogner
Date: 2025-12-22T13:28:41-07:00
New Revision: 38cdadd9c74509be636e41778043e4cd270be04b

URL: https://github.com/llvm/llvm-project/commit/38cdadd9c74509be636e41778043e4cd270be04b
DIFF: https://github.com/llvm/llvm-project/commit/38cdadd9c74509be636e41778043e4cd270be04b.diff

LOG: [DirectX] Teach MemIntrinsics about structs and nested arrays (#173078)

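Add handling for more complicated cases than simple arrays: the pointee
type is now flattened recursively through structs and nested arrays into
(leaf type, byte offset) pairs, and the memcpy is expanded into one
load/store per leaf, addressed with i8 GEPs at those byte offsets.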

Added: 
    llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-struct.ll

Modified: 
    llvm/lib/Target/DirectX/DXILMemIntrinsics.cpp
    llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-pointee.ll
    llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/DirectX/DXILMemIntrinsics.cpp b/llvm/lib/Target/DirectX/DXILMemIntrinsics.cpp
index 76352b3a2e354..fa8b7dae6c76a 100644

--- a/llvm/lib/Target/DirectX/DXILMemIntrinsics.cpp
+++ b/llvm/lib/Target/DirectX/DXILMemIntrinsics.cpp
@@ -109,6 +109,24 @@ static Type *getPointeeType(Value *Ptr, const DataLayout &DL) {
   llvm_unreachable("Could not calculate pointee type");
 }
 
+static size_t flattenTypes(Type *ContainerTy, const DataLayout &DL,
+                           SmallVectorImpl<std::pair<Type *, size_t>> &FlatTys,
+                           size_t NextOffset = 0) {
+  if (auto *AT = dyn_cast<ArrayType>(ContainerTy)) {
+    for (uint64_t I = 0, E = AT->getNumElements(); I != E; ++I)
+      NextOffset = flattenTypes(AT->getElementType(), DL, FlatTys, NextOffset);
+    return NextOffset;
+  }
+  if (auto *ST = dyn_cast<StructType>(ContainerTy)) {
+    for (Type *Ty : ST->elements())
+      NextOffset = flattenTypes(Ty, DL, FlatTys, NextOffset);
+    return NextOffset;
+  }
+
+  FlatTys.emplace_back(ContainerTy, NextOffset);
+  return NextOffset + DL.getTypeStoreSize(ContainerTy);
+}
+
 void expandMemCpy(MemCpyInst *MemCpy) {
   IRBuilder<> Builder(MemCpy);
   Value *Dst = MemCpy->getDest();
@@ -124,43 +142,36 @@ void expandMemCpy(MemCpyInst *MemCpy) {
 
   const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
 
-  auto *DstArrTy = dyn_cast<ArrayType>(getPointeeType(Dst, DL));
-  assert(DstArrTy && "Expected Dst of memcpy to be a Pointer to an Array Type");
-  if (auto *DstGlobalVar = dyn_cast<GlobalVariable>(Dst))
-    assert(!DstGlobalVar->isConstant() &&
-           "The Dst of memcpy must not be a constant Global Variable");
-  [[maybe_unused]] auto *SrcArrTy =
-      dyn_cast<ArrayType>(getPointeeType(Src, DL));
-  assert(SrcArrTy && "Expected Src of memcpy to be a Pointer to an Array Type");
-
-  Type *DstElemTy = DstArrTy->getElementType();
-  uint64_t DstElemByteSize = DL.getTypeStoreSize(DstElemTy);
-  assert(DstElemByteSize > 0 && "Dst element type store size must be set");
-  Type *SrcElemTy = SrcArrTy->getElementType();
-  [[maybe_unused]] uint64_t SrcElemByteSize = DL.getTypeStoreSize(SrcElemTy);
-  assert(SrcElemByteSize > 0 && "Src element type store size must be set");
-
-  // This assumption simplifies implementation and covers currently-known
-  // use-cases for DXIL. It may be relaxed in the future if required.
-  assert(DstElemTy == SrcElemTy &&
-         "The element types of Src and Dst arrays must match");
-
-  [[maybe_unused]] uint64_t DstArrNumElems = DstArrTy->getArrayNumElements();
-  assert(DstElemByteSize * DstArrNumElems >= ByteLength &&
-         "Dst array size must be at least as large as the memcpy length");
-  [[maybe_unused]] uint64_t SrcArrNumElems = SrcArrTy->getArrayNumElements();
-  assert(SrcElemByteSize * SrcArrNumElems >= ByteLength &&
-         "Src array size must be at least as large as the memcpy length");
-
-  uint64_t NumElemsToCopy = ByteLength / DstElemByteSize;
-  assert(ByteLength % DstElemByteSize == 0 &&
-         "memcpy length must be divisible by array element type");
-  for (uint64_t I = 0; I < NumElemsToCopy; ++I) {
-    SmallVector<Value *, 2> Indices = {Builder.getInt32(0),
-                                       Builder.getInt32(I)};
-    Value *SrcPtr = Builder.CreateInBoundsGEP(SrcArrTy, Src, Indices, "gep");
-    Value *SrcVal = Builder.CreateLoad(SrcElemTy, SrcPtr);
-    Value *DstPtr = Builder.CreateInBoundsGEP(DstArrTy, Dst, Indices, "gep");
+  SmallVector<std::pair<Type *, size_t>> FlattenedTypes;
+  [[maybe_unused]] size_t MaxLength =
+      flattenTypes(getPointeeType(Dst, DL), DL, FlattenedTypes);
+  assert(MaxLength >= ByteLength && "Dst not large enough for memcpy");
+
+  LLVM_DEBUG({
+    // Check if Src is layout compatible with Dst. This should always be true
+    // unless the frontend did something wrong.
+    SmallVector<std::pair<Type *, size_t>> SrcTypes;
+    size_t SrcLength = flattenTypes(getPointeeType(Src, DL), DL, SrcTypes);
+    assert(SrcLength >= ByteLength && "Src not large enough for memcpy");
+    for (const auto &[LHS, RHS] : zip(FlattenedTypes, SrcTypes)) {
+      auto &[DstTy, DstOffset] = LHS;
+      auto &[SrcTy, SrcOffset] = RHS;
+      assert(DstTy == SrcTy && "Mismatched types for memcpy");
+      assert(DstOffset == SrcOffset && "Incompatible layouts for memcpy");
+      if (DstOffset >= ByteLength)
+        break;
+    }
+  });
+
+  for (const auto &[Ty, Offset] : FlattenedTypes) {
+    if (Offset >= ByteLength)
+      break;
+    // TODO: Should we skip padding types here?
+    Type *Int8Ty = Builder.getInt8Ty();
+    Value *ByteOffset = Builder.getInt32(Offset);
+    Value *SrcPtr = Builder.CreateInBoundsGEP(Int8Ty, Src, ByteOffset);
+    Value *SrcVal = Builder.CreateLoad(Ty, SrcPtr);
+    Value *DstPtr = Builder.CreateInBoundsGEP(Int8Ty, Dst, ByteOffset);
     Builder.CreateStore(SrcVal, DstPtr);
   }
 

diff  --git a/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-pointee.ll b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-pointee.ll
index 326d9797ea88a..ae6793310fdb3 100644
--- a/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-pointee.ll
+++ b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-pointee.ll
@@ -11,13 +11,13 @@ define void @test_structarray_alloca() "hlsl.export" {
 ; CHECK-NEXT:    [[IN:%.*]] = alloca [[STRUCT_S]], align 16
 ; CHECK-NEXT:    [[OUT_I:%.*]] = getelementptr inbounds nuw i8, ptr [[OUT]], i32 16
 ; CHECK-NEXT:    [[IN_I:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i32 16
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 0
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[TMP2]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 16
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -38,13 +38,13 @@ define void @test_structarray_alloca_typed() "hlsl.export" {
 ; CHECK-NEXT:    [[IN:%.*]] = alloca [[STRUCT_S]], align 16
 ; CHECK-NEXT:    [[OUT_I:%.*]] = getelementptr { <4 x i32>, [2 x <4 x i32>] }, ptr [[OUT]], i32 0, i32 1
 ; CHECK-NEXT:    [[IN_I:%.*]] = getelementptr { <4 x i32>, [2 x <4 x i32>] }, ptr [[IN]], i32 0, i32 1
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[GEP]], align 16
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 0
 ; CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr [[GEP1]], align 16
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[GEP2]], align 16
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 16
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[GEP3]], align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -65,8 +65,8 @@ define void @test_structarray_groupshared() "hlsl.export" {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared2, i32 16), align 16
 ; CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared1, i32 16), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(3) getelementptr inbounds ([2 x <4 x i32>], ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared2, i32 16), i32 0, i32 1), align 16
-; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr addrspace(3) getelementptr inbounds ([2 x <4 x i32>], ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared1, i32 16), i32 0, i32 1), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared2, i32 16), i32 16), align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @shared1, i32 16), i32 16), align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -84,13 +84,13 @@ define void @test_structarray_in_buffer() "hlsl.export" {
 ; CHECK-NEXT:    [[OUT_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P_OUT]], i32 16
 ; CHECK-NEXT:    [[P_IN:%.*]] = tail call noundef nonnull align 1 dereferenceable(48) ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_s_struct.Ss_0_0t(target("dx.RawBuffer", [[STRUCT_S]], 0, 0) [[H_IN]], i32 0)
 ; CHECK-NEXT:    [[IN_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P_IN]], i32 16
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 0
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[TMP2]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[OUT_I]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[OUT_I]], i32 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[IN_I]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[IN_I]], i32 16
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-struct.ll b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-struct.ll
new file mode 100644
index 0000000000000..060d08d99b550
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy-struct.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-mem-intrinsics -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+%struct.S = type { <4 x i32>, [2 x <4 x i32>] }
+
+define void @test_struct_alloca() "hlsl.export" {
+; CHECK-LABEL: define void @test_struct_alloca(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[OUT:%.*]] = alloca [[STRUCT_S:%.*]], align 16
+; CHECK-NEXT:    [[IN:%.*]] = alloca [[STRUCT_S]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 16
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 16
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 32
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 32
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP8]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %out = alloca %struct.S
+  %in = alloca %struct.S
+  tail call void @llvm.memcpy(ptr %out, ptr %in, i32 48, i1 false)
+  ret void
+}
+
+define void @test_array_of_array() "hlsl.export" {
+; CHECK-LABEL: define void @test_array_of_array(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[OUT:%.*]] = alloca [2 x [3 x float]], align 4
+; CHECK-NEXT:    [[IN:%.*]] = alloca [2 x [3 x float]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 0
+; CHECK-NEXT:    store float [[TMP1]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 4
+; CHECK-NEXT:    store float [[TMP4]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 8
+; CHECK-NEXT:    store float [[TMP7]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 12
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 12
+; CHECK-NEXT:    store float [[TMP10]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 16
+; CHECK-NEXT:    store float [[TMP13]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[IN]], i32 20
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 20
+; CHECK-NEXT:    store float [[TMP16]], ptr [[TMP17]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %out = alloca [2 x [3 x float]]
+  %in = alloca [2 x [3 x float]]
+  tail call void @llvm.memcpy(ptr %out, ptr %in, i32 24, i1 false)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy.ll b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy.ll
index c3acd757e2180..a4cfa5224b382 100644
--- a/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy.ll
+++ b/llvm/test/CodeGen/DirectX/MemIntrinsics/memcpy.ll
@@ -6,9 +6,9 @@ define void @replace_int_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [1 x i32], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [1 x i32], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [1 x i32], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [1 x i32], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP3]], ptr [[GEP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -23,17 +23,17 @@ define void @replace_3int_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [3 x i32], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [3 x i32], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP3]], ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 2
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[GEP4]], align 4
-; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 2
+; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
 ; CHECK-NEXT:    store i32 [[TMP5]], ptr [[GEP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -48,13 +48,13 @@ define void @replace_mismatched_size_int_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x i32], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [3 x i32], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP3]], ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr [[GEP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -69,13 +69,13 @@ define void @replace_int16_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x i16], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x i16], align 2
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store i16 [[TMP3]], ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 2
 ; CHECK-NEXT:    store i16 [[TMP4]], ptr [[GEP3]], align 2
 ; CHECK-NEXT:    ret void
 ;
@@ -90,13 +90,13 @@ define void @replace_float_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x float], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x float], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[GEP]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store float [[TMP3]], ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    store float [[TMP4]], ptr [[GEP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -111,13 +111,13 @@ define void @replace_double_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x double], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x double], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[GEP]], align 8
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store double [[TMP3]], ptr [[GEP1]], align 8
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[GEP2]], align 8
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
 ; CHECK-NEXT:    store double [[TMP4]], ptr [[GEP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -132,13 +132,13 @@ define void @replace_half_memcpy_test() #0 {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x half], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x half], align 2
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP2]], i32 0, i32 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store half [[TMP3]], ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP2]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 2
 ; CHECK-NEXT:    store half [[TMP4]], ptr [[GEP3]], align 2
 ; CHECK-NEXT:    ret void
 ;