[llvm] Reapply "[AMDGPU] Handle memcpy()-like ops in LowerBufferFatPointers (#126621)" (PR #129078)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 27 08:21:03 PST 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Krzysztof Drewniak (krzysz00)

Changes:

This reverts commit 1559a65efaf327f9c72e14d4bb1834f076e7fc20.

Fixed the test (I suspect it was broken by an unrelated change in the merge).

---

Patch is 402.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129078.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+98-24) 
- (added) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+1345) 
- (added) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+1730) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 86b2c4f78fc3e..608b43b59eed3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -45,16 +45,16 @@
 //
 // This pass proceeds in three main phases:
 //
-// ## Rewriting loads and stores of p7
+// ## Rewriting loads and stores of p7 and memcpy()-like handling
 //
 // The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`,
 // including aggregates containing such pointers, to ones that use `i160`. This
-// is handled by `StoreFatPtrsAsIntsVisitor` , which visits loads, stores, and
-// allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
-// rewrites that type to one where the p7s are replaced by i160s, copying other
-// parts of aggregates as needed. In the case of a store, each pointer is
-// `ptrtoint`d to i160 before storing, and load integers are `inttoptr`d back.
-// This same transformation is applied to vectors of pointers.
+// is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits
+// loads, stores, and allocas and, if the loaded or stored type contains `ptr
+// addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
+// copying other parts of aggregates as needed. In the case of a store, each
+// pointer is `ptrtoint`d to i160 before storing, and load integers are
+// `inttoptr`d back. This same transformation is applied to vectors of pointers.
 //
 // Such a transformation allows the later phases of the pass to not need
 // to handle buffer fat pointers moving to and from memory, where we load
@@ -66,6 +66,10 @@
 // Atomic operations on `ptr addrspace(7)` values are not supported, as the
 // hardware does not include a 160-bit atomic.
 //
+// In order to save on O(N) work and to ensure that the contents type
+// legalizer correctly splits up wide loads, also unconditionally lower
+// memcpy-like intrinsics into loops here.
+//
 // ## Buffer contents type legalization
 //
 // The underlying buffer intrinsics only support types up to 128 bits long,
@@ -231,20 +235,24 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ReplaceConstant.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 #define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
 /// marshalling costs when reading or storing these values, but since placing
 /// such pointers into memory is an uncommon operation at best, we feel that
 /// this cost is acceptable for better performance in the common case.
-class StoreFatPtrsAsIntsVisitor
-    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
+class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
+    : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
   BufferFatPtrToIntTypeMap *TypeMap;
 
   ValueToValueMapTy ConvertedForStore;
 
   IRBuilder<> IRB;
 
+  const TargetMachine *TM;
+
   // Convert all the buffer fat pointers within the input value to integers
   // so that it can be stored in memory.
   Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,8 +458,10 @@ class StoreFatPtrsAsIntsVisitor
   Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
 
 public:
-  StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
-      : TypeMap(TypeMap), IRB(Ctx) {}
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+                                           LLVMContext &Ctx,
+                                           const TargetMachine *TM)
+      : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
   bool processFunction(Function &F);
 
   bool visitInstruction(Instruction &I) { return false; }
@@ -457,11 +469,16 @@ class StoreFatPtrsAsIntsVisitor
   bool visitLoadInst(LoadInst &LI);
   bool visitStoreInst(StoreInst &SI);
   bool visitGetElementPtrInst(GetElementPtrInst &I);
+
+  bool visitMemCpyInst(MemCpyInst &MCI);
+  bool visitMemMoveInst(MemMoveInst &MMI);
+  bool visitMemSetInst(MemSetInst &MSI);
+  bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
 };
 } // namespace
 
-Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
-                                                const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
+    Value *V, Type *From, Type *To, const Twine &Name) {
   if (From == To)
     return V;
   ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
   return Ret;
 }
 
-Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
-                                                const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
+    Value *V, Type *From, Type *To, const Twine &Name) {
   if (From == To)
     return V;
   if (isBufferFatPtrOrVector(To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
   return Ret;
 }
 
-bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
   bool Changed = false;
-  // The visitors will mutate GEPs and allocas, but will push loads and stores
-  // to the worklist to avoid invalidation.
+  // Process memcpy-like instructions after the main iteration because they can
+  // invalidate iterators.
+  SmallVector<WeakTrackingVH> CanBecomeLoops;
   for (Instruction &I : make_early_inc_range(instructions(F))) {
-    Changed |= visit(I);
+    if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
+      CanBecomeLoops.push_back(&I);
+    else
+      Changed |= visit(I);
+  }
+  for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
+    Changed |= visit(cast<Instruction>(VH));
   }
   ConvertedForStore.clear();
   return Changed;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
   Type *Ty = I.getAllocatedType();
   Type *NewTy = TypeMap->remapType(Ty);
   if (Ty == NewTy)
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
+    GetElementPtrInst &I) {
   Type *Ty = I.getSourceElementType();
   Type *NewTy = TypeMap->remapType(Ty);
   if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
   Type *Ty = LI.getType();
   Type *IntTy = TypeMap->remapType(Ty);
   if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
   Value *V = SI.getValueOperand();
   Type *Ty = V->getType();
   Type *IntTy = TypeMap->remapType(Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
   return true;
 }
 
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
+    MemCpyInst &MCI) {
+  // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
+  // need loop expansion here.
+  if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+      MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemCpyAsLoop(&MCI,
+                           TM->getTargetTransformInfo(*MCI.getFunction()));
+  MCI.eraseFromParent();
+  return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
+    MemMoveInst &MMI) {
+  if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+      MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  report_fatal_error(
+      "memmove() on buffer descriptors is not implemented because pointer "
+      "comparison on buffer descriptors isn't implemented\n");
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
+    MemSetInst &MSI) {
+  if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemSetAsLoop(&MSI);
+  MSI.eraseFromParent();
+  return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
+    MemSetPatternInst &MSPI) {
+  if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemSetPatternAsLoop(&MSPI);
+  MSPI.eraseFromParent();
+  return true;
+}
+
 namespace {
 /// Convert loads/stores of types that the buffer intrinsics can't handle into
 /// one or more such loads/stores that consist of legal types.
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
 
 bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
   bool Changed = false;
+  // Note: memory transfer intrinsics were already expanded and won't appear here.
   for (Instruction &I : make_early_inc_range(instructions(F))) {
     Changed |= visit(I);
   }
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
   case Intrinsic::invariant_end:
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
+  case Intrinsic::memcpy:
+  case Intrinsic::memcpy_inline:
+  case Intrinsic::memmove:
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline:
+  case Intrinsic::experimental_memset_pattern:
     return true;
   }
 }
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
         /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
   }
 
-  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
+                                                         &TM);
   LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
                                                               M.getContext());
   for (Function &F : M.functions()) {
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
new file mode 100644
index 0000000000000..8e023723ec25c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -0,0 +1,1345 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefixes=GISEL,GISEL-GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefixes=GISEL,GISEL-GFX1100 %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-mem-transfer.ll to verify the IR produced by
+; the lowering.
+;
+; This file is a sanity check to make sure that the code generated
+; for buffer-related memcpy() calls turns into something reasonable in
+; the backend, despite the wide intermediate vectors.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn--"
+
+;; memcpy
+
+declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1)
+
+define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
+; SDAG-LABEL: memcpy_known:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s7, s24
+; SDAG-NEXT:    s_mov_b32 s6, s23
+; SDAG-NEXT:    s_mov_b32 s5, s22
+; SDAG-NEXT:    s_mov_b32 s4, s21
+; SDAG-NEXT:    s_mov_b32 s8, 0
+; SDAG-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SDAG-NEXT:  .LBB0_1: ; %load-store-loop
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    s_add_i32 s9, s20, s8
+; SDAG-NEXT:    v_mov_b32_e32 v60, s9
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen
+; SDAG-NEXT:    s_add_i32 s9, s25, s8
+; SDAG-NEXT:    s_addk_i32 s8, 0x100
+; SDAG-NEXT:    s_cmpk_lt_u32 s8, 0x2000
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v60, s[16:19], 0 offen offset:32
+; SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v60, s[16:19], 0 offen offset:48
+; SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v60, s[16:19], 0 offen offset:64
+; SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v60, s[16:19], 0 offen offset:80
+; SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v60, s[16:19], 0 offen offset:96
+; SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v60, s[16:19], 0 offen offset:112
+; SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v60, s[16:19], 0 offen offset:128
+; SDAG-NEXT:    buffer_load_dwordx4 v[36:39], v60, s[16:19], 0 offen offset:144
+; SDAG-NEXT:    buffer_load_dwordx4 v[48:51], v60, s[16:19], 0 offen offset:160
+; SDAG-NEXT:    buffer_load_dwordx4 v[52:55], v60, s[16:19], 0 offen offset:176
+; SDAG-NEXT:    buffer_load_dwordx4 v[40:43], v60, s[16:19], 0 offen offset:192
+; SDAG-NEXT:    buffer_load_dwordx4 v[44:47], v60, s[16:19], 0 offen offset:208
+; SDAG-NEXT:    buffer_load_dwordx4 v[56:59], v60, s[16:19], 0 offen offset:224
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[60:63], v60, s[16:19], 0 offen offset:240
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SDAG-NEXT:    v_mov_b32_e32 v0, s9
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen
+; SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen offset:32
+; SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v0, s[4:7], 0 offen offset:48
+; SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v0, s[4:7], 0 offen offset:64
+; SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v0, s[4:7], 0 offen offset:80
+; SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v0, s[4:7], 0 offen offset:96
+; SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v0, s[4:7], 0 offen offset:112
+; SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v0, s[4:7], 0 offen offset:128
+; SDAG-NEXT:    buffer_store_dwordx4 v[36:39], v0, s[4:7], 0 offen offset:144
+; SDAG-NEXT:    buffer_store_dwordx4 v[48:51], v0, s[4:7], 0 offen offset:160
+; SDAG-NEXT:    buffer_store_dwordx4 v[52:55], v0, s[4:7], 0 offen offset:176
+; SDAG-NEXT:    buffer_store_dwordx4 v[40:43], v0, s[4:7], 0 offen offset:192
+; SDAG-NEXT:    buffer_store_dwordx4 v[44:47], v0, s[4:7], 0 offen offset:208
+; SDAG-NEXT:    buffer_store_dwordx4 v[56:59], v0, s[4:7], 0 offen offset:224
+; SDAG-NEXT:    buffer_store_dwordx4 v[60:63], v0, s[4:7], 0 offen offset:240
+; SDAG-NEXT:    s_cbranch_scc1 .LBB0_1
+; SDAG-NEXT:  ; %bb.2: ; %memcpy-split
+; SDAG-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SDAG-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte F...
[truncated]

``````````
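
Since the patch text above is truncated, a few short sketches may help readers follow the design. First, the phase-1 rewrite described in the header comment: stores of `ptr addrspace(7)` become stores of `i160` via `ptrtoint`, and loads are converted back with `inttoptr`. A minimal illustrative sketch (not taken from the patch; the function and value names are hypothetical):

```llvm
; Hypothetical input: spilling a buffer fat pointer to a scratch slot.
define void @spill(ptr addrspace(5) %slot, ptr addrspace(7) %fat) {
  store ptr addrspace(7) %fat, ptr addrspace(5) %slot
  ret void
}

; Roughly what phase 1 produces: the p7 value is stored as an i160.
define void @spill.rewritten(ptr addrspace(5) %slot, ptr addrspace(7) %fat) {
  %fat.int = ptrtoint ptr addrspace(7) %fat to i160
  store i160 %fat.int, ptr addrspace(5) %slot
  ret void
}
```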
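
Second, the new `visitMemCpyInst()` hands the intrinsic to the generic `llvm::expandMemCpyAsLoop()` utility from `llvm/Transforms/Utils/LowerMemIntrinsics.h`, passing the target's TTI so it can choose the copy width. A rough sketch of the loop shape for the known-length copy tested above; the 256-byte `<64 x i32>` step is an assumption inferred from the `0x100` stride and `0x2000` bound in the generated assembly:

```llvm
declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1)

define void @copy(ptr addrspace(7) %dst, ptr addrspace(7) %src) {
  call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 8192, i1 false)
  ret void
}

; Approximate post-expansion shape (the "load-store-loop" / "memcpy-split"
; blocks seen in the assembly); the element type is target-dependent.
define void @copy.expanded(ptr addrspace(7) %dst, ptr addrspace(7) %src) {
entry:
  br label %loop
loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %s = getelementptr inbounds i8, ptr addrspace(7) %src, i32 %i
  %v = load <64 x i32>, ptr addrspace(7) %s, align 1
  %d = getelementptr inbounds i8, ptr addrspace(7) %dst, i32 %i
  store <64 x i32> %v, ptr addrspace(7) %d, align 1
  %i.next = add i32 %i, 256
  %again = icmp ult i32 %i.next, 8192
  br i1 %again, label %loop, label %split
split:
  ret void
}
```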
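
Finally, the header comment's rationale for expanding in this pass at all: the underlying buffer intrinsics handle at most 128 bits, so the contents-type legalizer must split the wide per-iteration accesses the loop produces. A simplified sketch for one 256-bit load (the `shufflevector` recombination is an illustration of the splitting, not the pass's literal output):

```llvm
; Hypothetical 256-bit load through a buffer fat pointer...
define <8 x i32> @wide_load(ptr addrspace(7) %p) {
  %v = load <8 x i32>, ptr addrspace(7) %p
  ret <8 x i32> %v
}

; ...conceptually splits into two 128-bit pieces that are recombined.
define <8 x i32> @wide_load.legalized(ptr addrspace(7) %p) {
  %lo = load <4 x i32>, ptr addrspace(7) %p
  %p.hi = getelementptr inbounds i8, ptr addrspace(7) %p, i32 16
  %hi = load <4 x i32>, ptr addrspace(7) %p.hi
  %v = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %v
}
```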



https://github.com/llvm/llvm-project/pull/129078

