[llvm] Reapply "[AMDGPU] Handle memcpy()-like ops in LowerBufferFatPointers (#126621)" (PR #129078)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 08:21:03 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Krzysztof Drewniak (krzysz00)
This reverts commit 1559a65efaf327f9c72e14d4bb1834f076e7fc20.
Fixed the test (I suspect it was broken by an unrelated change in the merge).
---
Patch is 402.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129078.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+98-24)
- (added) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+1345)
- (added) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+1730)
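For context before reading the diff: the net effect of the change is that a `memcpy` between buffer fat pointers is expanded into an explicit load/store loop early in the pass, so that the later buffer contents type legalization can split the loop's wide accesses into legal buffer operations. A hand-written sketch of the shape this takes for the `memcpy_known` test below (not the pass's literal output; value names and the `<64 x i32>` chunk type are illustrative, sized to match the 0x100-byte stride and 0x2000-byte bound visible in the generated assembly):

```llvm
define void @sketch(ptr addrspace(7) %src, ptr addrspace(7) %dst) {
entry:
  br label %load-store-loop

load-store-loop:                           ; copies 256 bytes per iteration
  %i = phi i32 [ 0, %entry ], [ %i.next, %load-store-loop ]
  %src.gep = getelementptr inbounds i8, ptr addrspace(7) %src, i32 %i
  %chunk = load <64 x i32>, ptr addrspace(7) %src.gep, align 1
  %dst.gep = getelementptr inbounds i8, ptr addrspace(7) %dst, i32 %i
  store <64 x i32> %chunk, ptr addrspace(7) %dst.gep, align 1
  %i.next = add i32 %i, 256
  %cont = icmp ult i32 %i.next, 8192
  br i1 %cont, label %load-store-loop, label %memcpy-split

memcpy-split:
  ret void
}
```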
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 86b2c4f78fc3e..608b43b59eed3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -45,16 +45,16 @@
//
// This pass proceeds in three main phases:
//
-// ## Rewriting loads and stores of p7
+// ## Rewriting loads and stores of p7 and memcpy()-like handling
//
// The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`,
// including aggregates containing such pointers, to ones that use `i160`. This
-// is handled by `StoreFatPtrsAsIntsVisitor` , which visits loads, stores, and
-// allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
-// rewrites that type to one where the p7s are replaced by i160s, copying other
-// parts of aggregates as needed. In the case of a store, each pointer is
-// `ptrtoint`d to i160 before storing, and load integers are `inttoptr`d back.
-// This same transformation is applied to vectors of pointers.
+// is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits
+// loads, stores, and allocas and, if the loaded or stored type contains `ptr
+// addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
+// copying other parts of aggregates as needed. In the case of a store, each
+// pointer is `ptrtoint`d to i160 before storing, and loaded integers are
+// `inttoptr`d back. This same transformation is applied to vectors of pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we load
@@ -66,6 +66,10 @@
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
+// In order to save O(N) work and to ensure that the contents type
+// legalizer correctly splits up wide loads, this phase also unconditionally
+// lowers memcpy-like intrinsics into loops here.
+//
// ## Buffer contents type legalization
//
// The underlying buffer intrinsics only support types up to 128 bits long,
@@ -231,20 +235,24 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
/// marshalling costs when reading or storing these values, but since placing
/// such pointers into memory is an uncommon operation at best, we feel that
/// this cost is acceptable for better performance in the common case.
-class StoreFatPtrsAsIntsVisitor
- : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
+class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
+ : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
BufferFatPtrToIntTypeMap *TypeMap;
ValueToValueMapTy ConvertedForStore;
IRBuilder<> IRB;
+ const TargetMachine *TM;
+
// Convert all the buffer fat pointers within the input value to integers
// so that the value can be stored in memory.
Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,8 +458,10 @@ class StoreFatPtrsAsIntsVisitor
Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
public:
- StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
- : TypeMap(TypeMap), IRB(Ctx) {}
+ StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+ LLVMContext &Ctx,
+ const TargetMachine *TM)
+ : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
bool processFunction(Function &F);
bool visitInstruction(Instruction &I) { return false; }
@@ -457,11 +469,16 @@ class StoreFatPtrsAsIntsVisitor
bool visitLoadInst(LoadInst &LI);
bool visitStoreInst(StoreInst &SI);
bool visitGetElementPtrInst(GetElementPtrInst &I);
+
+ bool visitMemCpyInst(MemCpyInst &MCI);
+ bool visitMemMoveInst(MemMoveInst &MMI);
+ bool visitMemSetInst(MemSetInst &MSI);
+ bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
};
} // namespace
-Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
- const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
+ Value *V, Type *From, Type *To, const Twine &Name) {
if (From == To)
return V;
ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
return Ret;
}
-Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
- const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
+ Value *V, Type *From, Type *To, const Twine &Name) {
if (From == To)
return V;
if (isBufferFatPtrOrVector(To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
return Ret;
}
-bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
bool Changed = false;
- // The visitors will mutate GEPs and allocas, but will push loads and stores
- // to the worklist to avoid invalidation.
+ // Process memcpy-like instructions after the main iteration because they can
+ // invalidate iterators.
+ SmallVector<WeakTrackingVH> CanBecomeLoops;
for (Instruction &I : make_early_inc_range(instructions(F))) {
- Changed |= visit(I);
+ if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
+ CanBecomeLoops.push_back(&I);
+ else
+ Changed |= visit(I);
+ }
+ for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
+ Changed |= visit(cast<Instruction>(VH));
}
ConvertedForStore.clear();
return Changed;
}
-bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
Type *Ty = I.getAllocatedType();
Type *NewTy = TypeMap->remapType(Ty);
if (Ty == NewTy)
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
return true;
}
-bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
+ GetElementPtrInst &I) {
Type *Ty = I.getSourceElementType();
Type *NewTy = TypeMap->remapType(Ty);
if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
return true;
}
-bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
Type *Ty = LI.getType();
Type *IntTy = TypeMap->remapType(Ty);
if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
return true;
}
-bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
Value *V = SI.getValueOperand();
Type *Ty = V->getType();
Type *IntTy = TypeMap->remapType(Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
return true;
}
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
+ MemCpyInst &MCI) {
+ // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
+ // need loop expansion here.
+ if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+ MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ llvm::expandMemCpyAsLoop(&MCI,
+ TM->getTargetTransformInfo(*MCI.getFunction()));
+ MCI.eraseFromParent();
+ return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
+ MemMoveInst &MMI) {
+ if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+ MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ report_fatal_error(
+ "memmove() on buffer descriptors is not implemented because pointer "
+ "comparison on buffer descriptors isn't implemented\n");
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
+ MemSetInst &MSI) {
+ if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ llvm::expandMemSetAsLoop(&MSI);
+ MSI.eraseFromParent();
+ return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
+ MemSetPatternInst &MSPI) {
+ if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ llvm::expandMemSetPatternAsLoop(&MSPI);
+ MSPI.eraseFromParent();
+ return true;
+}
+
namespace {
/// Convert loads/stores of types that the buffer intrinsics can't handle into
/// one or more such loads/stores that consist of legal types.
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
bool Changed = false;
+ // Note: memory transfer intrinsics won't be seen here, since the first
+ // phase already expanded them into loops.
for (Instruction &I : make_early_inc_range(instructions(F))) {
Changed |= visit(I);
}
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::invariant_end:
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
+ case Intrinsic::memcpy:
+ case Intrinsic::memcpy_inline:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::memset_inline:
+ case Intrinsic::experimental_memset_pattern:
return true;
}
}
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
/*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
}
- StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+ StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
+ &TM);
LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
M.getContext());
for (Function &F : M.functions()) {
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
new file mode 100644
index 0000000000000..8e023723ec25c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -0,0 +1,1345 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-mem-transfer.ll to verify the IR produced by
+; the lowering.
+;
+; This file is a sanity check to make sure that the code generated
+; for buffer-related memcpy() calls turns into something reasonable in
+; the backend, despite the wide intermediate vectors.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn--"
+
+;; memcpy
+
+declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1)
+
+define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
+; SDAG-LABEL: memcpy_known:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s7, s24
+; SDAG-NEXT: s_mov_b32 s6, s23
+; SDAG-NEXT: s_mov_b32 s5, s22
+; SDAG-NEXT: s_mov_b32 s4, s21
+; SDAG-NEXT: s_mov_b32 s8, 0
+; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SDAG-NEXT: .LBB0_1: ; %load-store-loop
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: s_add_i32 s9, s20, s8
+; SDAG-NEXT: v_mov_b32_e32 v60, s9
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen
+; SDAG-NEXT: s_add_i32 s9, s25, s8
+; SDAG-NEXT: s_addk_i32 s8, 0x100
+; SDAG-NEXT: s_cmpk_lt_u32 s8, 0x2000
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SDAG-NEXT: buffer_load_dwordx4 v[8:11], v60, s[16:19], 0 offen offset:32
+; SDAG-NEXT: buffer_load_dwordx4 v[12:15], v60, s[16:19], 0 offen offset:48
+; SDAG-NEXT: buffer_load_dwordx4 v[16:19], v60, s[16:19], 0 offen offset:64
+; SDAG-NEXT: buffer_load_dwordx4 v[20:23], v60, s[16:19], 0 offen offset:80
+; SDAG-NEXT: buffer_load_dwordx4 v[24:27], v60, s[16:19], 0 offen offset:96
+; SDAG-NEXT: buffer_load_dwordx4 v[28:31], v60, s[16:19], 0 offen offset:112
+; SDAG-NEXT: buffer_load_dwordx4 v[32:35], v60, s[16:19], 0 offen offset:128
+; SDAG-NEXT: buffer_load_dwordx4 v[36:39], v60, s[16:19], 0 offen offset:144
+; SDAG-NEXT: buffer_load_dwordx4 v[48:51], v60, s[16:19], 0 offen offset:160
+; SDAG-NEXT: buffer_load_dwordx4 v[52:55], v60, s[16:19], 0 offen offset:176
+; SDAG-NEXT: buffer_load_dwordx4 v[40:43], v60, s[16:19], 0 offen offset:192
+; SDAG-NEXT: buffer_load_dwordx4 v[44:47], v60, s[16:19], 0 offen offset:208
+; SDAG-NEXT: buffer_load_dwordx4 v[56:59], v60, s[16:19], 0 offen offset:224
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dwordx4 v[60:63], v60, s[16:19], 0 offen offset:240
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SDAG-NEXT: v_mov_b32_e32 v0, s9
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen
+; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen offset:16
+; SDAG-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen offset:32
+; SDAG-NEXT: buffer_store_dwordx4 v[12:15], v0, s[4:7], 0 offen offset:48
+; SDAG-NEXT: buffer_store_dwordx4 v[16:19], v0, s[4:7], 0 offen offset:64
+; SDAG-NEXT: buffer_store_dwordx4 v[20:23], v0, s[4:7], 0 offen offset:80
+; SDAG-NEXT: buffer_store_dwordx4 v[24:27], v0, s[4:7], 0 offen offset:96
+; SDAG-NEXT: buffer_store_dwordx4 v[28:31], v0, s[4:7], 0 offen offset:112
+; SDAG-NEXT: buffer_store_dwordx4 v[32:35], v0, s[4:7], 0 offen offset:128
+; SDAG-NEXT: buffer_store_dwordx4 v[36:39], v0, s[4:7], 0 offen offset:144
+; SDAG-NEXT: buffer_store_dwordx4 v[48:51], v0, s[4:7], 0 offen offset:160
+; SDAG-NEXT: buffer_store_dwordx4 v[52:55], v0, s[4:7], 0 offen offset:176
+; SDAG-NEXT: buffer_store_dwordx4 v[40:43], v0, s[4:7], 0 offen offset:192
+; SDAG-NEXT: buffer_store_dwordx4 v[44:47], v0, s[4:7], 0 offen offset:208
+; SDAG-NEXT: buffer_store_dwordx4 v[56:59], v0, s[4:7], 0 offen offset:224
+; SDAG-NEXT: buffer_store_dwordx4 v[60:63], v0, s[4:7], 0 offen offset:240
+; SDAG-NEXT: s_cbranch_scc1 .LBB0_1
+; SDAG-NEXT: ; %bb.2: ; %memcpy-split
+; SDAG-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte F...
[truncated]
``````````
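To make the "first phase" described in the updated header comment concrete: a `ptr addrspace(7)` round-trips through memory as an `i160`. A hand-written sketch of the post-rewrite IR (names illustrative):

```llvm
define ptr addrspace(7) @store_reload_sketch(ptr addrspace(7) %fat) {
  %slot = alloca i160, addrspace(5)
  ; On the store side, the fat pointer is `ptrtoint`d to i160 first.
  %fat.int = ptrtoint ptr addrspace(7) %fat to i160
  store i160 %fat.int, ptr addrspace(5) %slot
  ; On the load side, the integer is `inttoptr`d back to p7.
  %reload.int = load i160, ptr addrspace(5) %slot
  %reload = inttoptr i160 %reload.int to ptr addrspace(7)
  ret ptr addrspace(7) %reload
}
```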
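The buffer contents type legalization the comments refer to is what then keeps the expanded loop's accesses legal: the underlying buffer intrinsics support at most 128 bits, so a wide load like the `<64 x i32>` above is split into 16-byte slices, which surface as the runs of `buffer_load_dwordx4`/`buffer_store_dwordx4` in the checked assembly. Another hand-written sketch (names illustrative; the actual legalized IR is what lower-buffer-fat-pointers-mem-transfer.ll checks):

```llvm
define <4 x i32> @legalize_sketch(ptr addrspace(7) %p) {
  ; First two of sixteen 128-bit slices of a 256-byte load.
  %slice0 = load <4 x i32>, ptr addrspace(7) %p, align 1
  %p1 = getelementptr inbounds i8, ptr addrspace(7) %p, i32 16
  %slice1 = load <4 x i32>, ptr addrspace(7) %p1, align 1
  ; ...fourteen more slices, whose results are then reassembled...
  ret <4 x i32> %slice0
}
```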
https://github.com/llvm/llvm-project/pull/129078