[llvm] 36527cb - [AMDGPU][GlobalISel] Legalize memcpy family of intrinsics
Mirko Brkusanin via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 7 03:33:27 PDT 2021
Author: Mirko Brkusanin
Date: 2021-09-07T12:24:07+02:00
New Revision: 36527cbe02c401671098c35b1a99853e01c291a9
URL: https://github.com/llvm/llvm-project/commit/36527cbe02c401671098c35b1a99853e01c291a9
DIFF: https://github.com/llvm/llvm-project/commit/36527cbe02c401671098c35b1a99853e01c291a9.diff
LOG: [AMDGPU][GlobalISel] Legalize memcpy family of intrinsics
Legalize G_MEMCPY, G_MEMMOVE, G_MEMSET and G_MEMCPY_INLINE.
The corresponding intrinsics are replaced by a loop that uses loads/stores in
the AMDGPULowerIntrinsics pass unless their length is a constant lower than
MemIntrinsicExpandSizeThresholdOpt (default 1024). Any G_MEM* instruction that
reaches the legalizer should therefore have a constant length argument and is
expanded into the appropriate number of loads and stores.
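For targets adopting the same scheme, the target-side change amounts to marking
these opcodes as lowered in the LegalizerInfo; LegalizerHelper::lower() then
dispatches G_MEMSET/G_MEMCPY/G_MEMMOVE to lowerMemCpyFamily() and
G_MEMCPY_INLINE to lowerMemcpyInline(). A minimal sketch of that rule,
mirroring the AMDGPULegalizerInfo change in this patch (placement inside a
hypothetical <Target>LegalizerInfo constructor is assumed):

  // Request lowering of the memcpy family into loads/stores. The helper
  // expects a constant length operand; non-constant lengths are reported
  // back as UnableToLegalize (and G_MEMCPY_INLINE asserts on them).
  getActionDefinitionsBuilder(
      {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();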
Differential Revision: https://reviews.llvm.org/D108357
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
Removed:
llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-memcpy-inline.mir
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 8bc89cbc40bb5..015c75d75a1a7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -594,19 +594,6 @@ class CombinerHelper {
bool tryEmitMemcpyInline(MachineInstr &MI);
private:
- // Memcpy family optimization helpers.
- bool tryEmitMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, Align DstAlign, Align SrcAlign,
- bool IsVolatile);
- bool optimizeMemcpy(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, uint64_t Limit, Align DstAlign,
- Align SrcAlign, bool IsVolatile);
- bool optimizeMemmove(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, Align DstAlign, Align SrcAlign,
- bool IsVolatile);
- bool optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
- uint64_t KnownLen, Align DstAlign, bool IsVolatile);
-
/// Given a non-indexed load or store instruction \p MI, find an offset that
/// can be usefully and legally folded into it as a post-indexing operation.
///
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index de6d67443f79b..74615c73741a2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -256,6 +256,20 @@ class LegalizerHelper {
LLT SrcTy, LLT NarrowTy,
unsigned ScalarOpc);
+ // Memcpy family legalization helpers.
+ LegalizeResult lowerMemset(MachineInstr &MI, Register Dst, Register Val,
+ uint64_t KnownLen, Align Alignment,
+ bool IsVolatile);
+ LegalizeResult lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign,
+ Align SrcAlign, bool IsVolatile);
+ LegalizeResult lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, uint64_t Limit, Align DstAlign,
+ Align SrcAlign, bool IsVolatile);
+ LegalizeResult lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign, Align SrcAlign,
+ bool IsVolatile);
+
public:
/// Return the alignment to use for a stack temporary object with the given
/// type.
@@ -403,6 +417,8 @@ class LegalizerHelper {
LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
LegalizeResult lowerVectorReduction(MachineInstr &MI);
+ LegalizeResult lowerMemcpyInline(MachineInstr &MI);
+ LegalizeResult lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0);
};
/// Helper function that creates a libcall to the given \p Name using the given
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9a781e355cbac..62df6fed36c93 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -28,7 +29,6 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetMachine.h"
#include <tuple>
#define DEBUG_TYPE "gi-combiner"
@@ -1111,81 +1111,6 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
Observer.changedInstr(*BrCond);
}
-static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
- // On Darwin, -Os means optimize for size without hurting performance, so
- // only really optimize for size when -Oz (MinSize) is used.
- if (MF.getTarget().getTargetTriple().isOSDarwin())
- return MF.getFunction().hasMinSize();
- return MF.getFunction().hasOptSize();
-}
-
-// Returns a list of types to use for memory op lowering in MemOps. A partial
-// port of findOptimalMemOpLowering in TargetLowering.
-static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
- unsigned Limit, const MemOp &Op,
- unsigned DstAS, unsigned SrcAS,
- const AttributeList &FuncAttributes,
- const TargetLowering &TLI) {
- if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
- return false;
-
- LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
-
- if (Ty == LLT()) {
- // Use the largest scalar type whose alignment constraints are satisfied.
- // We only need to check DstAlign here as SrcAlign is always greater or
- // equal to DstAlign (or zero).
- Ty = LLT::scalar(64);
- if (Op.isFixedDstAlign())
- while (Op.getDstAlign() < Ty.getSizeInBytes() &&
- !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
- Ty = LLT::scalar(Ty.getSizeInBytes());
- assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
- // FIXME: check for the largest legal type we can load/store to.
- }
-
- unsigned NumMemOps = 0;
- uint64_t Size = Op.size();
- while (Size) {
- unsigned TySize = Ty.getSizeInBytes();
- while (TySize > Size) {
- // For now, only use non-vector load / store's for the left-over pieces.
- LLT NewTy = Ty;
- // FIXME: check for mem op safety and legality of the types. Not all of
- // SDAGisms map cleanly to GISel concepts.
- if (NewTy.isVector())
- NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
- NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
- unsigned NewTySize = NewTy.getSizeInBytes();
- assert(NewTySize > 0 && "Could not find appropriate type");
-
- // If the new LLT cannot cover all of the remaining bits, then consider
- // issuing a (or a pair of) unaligned and overlapping load / store.
- bool Fast;
- // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
- MVT VT = getMVTForLLT(Ty);
- if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
- TLI.allowsMisalignedMemoryAccesses(
- VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
- MachineMemOperand::MONone, &Fast) &&
- Fast)
- TySize = Size;
- else {
- Ty = NewTy;
- TySize = NewTySize;
- }
- }
-
- if (++NumMemOps > Limit)
- return false;
-
- MemOps.push_back(Ty);
- Size -= TySize;
- }
-
- return true;
-}
-
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
if (Ty.isVector())
return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
@@ -1193,458 +1118,20 @@ static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
return IntegerType::get(C, Ty.getSizeInBits());
}
-// Get a vectorized representation of the memset value operand, GISel edition.
-static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
- MachineRegisterInfo &MRI = *MIB.getMRI();
- unsigned NumBits = Ty.getScalarSizeInBits();
- auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
- if (!Ty.isVector() && ValVRegAndVal) {
- APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
- APInt SplatVal = APInt::getSplat(NumBits, Scalar);
- return MIB.buildConstant(Ty, SplatVal).getReg(0);
- }
-
- // Extend the byte value to the larger type, and then multiply by a magic
- // value 0x010101... in order to replicate it across every byte.
- // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
- if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
- return MIB.buildConstant(Ty, 0).getReg(0);
- }
-
- LLT ExtType = Ty.getScalarType();
- auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
- if (NumBits > 8) {
- APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
- auto MagicMI = MIB.buildConstant(ExtType, Magic);
- Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
- }
-
- // For vector types create a G_BUILD_VECTOR.
- if (Ty.isVector())
- Val = MIB.buildSplatVector(Ty, Val).getReg(0);
-
- return Val;
-}
-
-bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
- Register Val, uint64_t KnownLen,
- Align Alignment, bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memset length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- bool OptSize = shouldLowerMemFuncForSize(MF);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
-
- auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
- bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
-
- if (!findGISelOptimalMemOpLowering(MemOps, Limit,
- MemOp::Set(KnownLen, DstAlignCanChange,
- Alignment,
- /*IsZeroMemset=*/IsZeroVal,
- /*IsVolatile=*/IsVolatile),
- DstPtrInfo.getAddrSpace(), ~0u,
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- MachineIRBuilder MIB(MI);
- // Find the largest store and generate the bit pattern for it.
- LLT LargestTy = MemOps[0];
- for (unsigned i = 1; i < MemOps.size(); i++)
- if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
- LargestTy = MemOps[i];
-
- // The memset stored value is always defined as an s8, so in order to make it
- // work with larger store types we need to repeat the bit pattern across the
- // wider type.
- Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
-
- if (!MemSetValue)
- return false;
-
- // Generate the stores. For each store type in the list, we generate the
- // matching store of that type to the destination address.
- LLT PtrTy = MRI.getType(Dst);
- unsigned DstOff = 0;
- unsigned Size = KnownLen;
- for (unsigned I = 0; I < MemOps.size(); I++) {
- LLT Ty = MemOps[I];
- unsigned TySize = Ty.getSizeInBytes();
- if (TySize > Size) {
- // Issuing an unaligned load / store pair that overlaps with the previous
- // pair. Adjust the offset accordingly.
- assert(I == MemOps.size() - 1 && I != 0);
- DstOff -= TySize - Size;
- }
-
- // If this store is smaller than the largest store see whether we can get
- // the smaller value for free with a truncate.
- Register Value = MemSetValue;
- if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
- MVT VT = getMVTForLLT(Ty);
- MVT LargestVT = getMVTForLLT(LargestTy);
- if (!LargestTy.isVector() && !Ty.isVector() &&
- TLI.isTruncateFree(LargestVT, VT))
- Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
- else
- Value = getMemsetValue(Val, Ty, MIB);
- if (!Value)
- return false;
- }
-
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
-
- Register Ptr = Dst;
- if (DstOff != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
- Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- }
-
- MIB.buildStore(Value, Ptr, *StoreMMO);
- DstOff += Ty.getSizeInBytes();
- Size -= TySize;
- }
-
- MI.eraseFromParent();
- return true;
-}
-
bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
-
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- Register Len = MI.getOperand(2).getReg();
-
- const auto *MMOIt = MI.memoperands_begin();
- const MachineMemOperand *MemOp = *MMOIt;
- bool IsVolatile = MemOp->isVolatile();
-
- // See if this is a constant length copy
- auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
- // FIXME: support dynamically sized G_MEMCPY_INLINE
- assert(LenVRegAndVal.hasValue() &&
- "inline memcpy with dynamic size is not yet supported");
- uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
- if (KnownLen == 0) {
- MI.eraseFromParent();
- return true;
- }
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- Align DstAlign = DstMMO.getBaseAlign();
- Align SrcAlign = SrcMMO.getBaseAlign();
-
- return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
- IsVolatile);
-}
-
-bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- Align DstAlign, Align SrcAlign,
- bool IsVolatile) {
- assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
- return optimizeMemcpy(MI, Dst, Src, KnownLen,
- std::numeric_limits<uint64_t>::max(), DstAlign,
- SrcAlign, IsVolatile);
-}
-
-bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- uint64_t Limit, Align DstAlign,
- Align SrcAlign, bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memcpy length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- Align Alignment = commonAlignment(DstAlign, SrcAlign);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- // FIXME: infer better src pointer alignment like SelectionDAG does here.
- // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
- // if the memcpy is in a tail call position.
-
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
- MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
-
- if (!findGISelOptimalMemOpLowering(
- MemOps, Limit,
- MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
- IsVolatile),
- DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
-
- // Don't promote to an alignment that would require dynamic stack
- // realignment.
- if (!TRI->hasStackRealignment(MF))
- while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
- NewAlign = NewAlign / 2;
-
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
-
- MachineIRBuilder MIB(MI);
- // Now we need to emit a pair of load and stores for each of the types we've
- // collected. I.e. for each type, generate a load from the source pointer of
- // that type width, and then generate a corresponding store to the dest buffer
- // of that value loaded. This can result in a sequence of loads and stores
- // mixed types, depending on what the target specifies as good types to use.
- unsigned CurrOffset = 0;
- LLT PtrTy = MRI.getType(Src);
- unsigned Size = KnownLen;
- for (auto CopyTy : MemOps) {
- // Issuing an unaligned load / store pair that overlaps with the previous
- // pair. Adjust the offset accordingly.
- if (CopyTy.getSizeInBytes() > Size)
- CurrOffset -= CopyTy.getSizeInBytes() - Size;
-
- // Construct MMOs for the accesses.
- auto *LoadMMO =
- MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- // Create the load.
- Register LoadPtr = Src;
- Register Offset;
- if (CurrOffset != 0) {
- Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
- .getReg(0);
- LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
- }
- auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
-
- // Create the store.
- Register StorePtr =
- CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- MIB.buildStore(LdVal, StorePtr, *StoreMMO);
- CurrOffset += CopyTy.getSizeInBytes();
- Size -= CopyTy.getSizeInBytes();
- }
-
- MI.eraseFromParent();
- return true;
-}
-
-bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- Align DstAlign, Align SrcAlign,
- bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memmove length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- bool OptSize = shouldLowerMemFuncForSize(MF);
- Align Alignment = commonAlignment(DstAlign, SrcAlign);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
- MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
-
- // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
- // to a bug in it's findOptimalMemOpLowering implementation. For now do the
- // same thing here.
- if (!findGISelOptimalMemOpLowering(
- MemOps, Limit,
- MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
- /*IsVolatile*/ true),
- DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
-
- // Don't promote to an alignment that would require dynamic stack
- // realignment.
- if (!TRI->hasStackRealignment(MF))
- while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
- NewAlign = NewAlign / 2;
-
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
-
- MachineIRBuilder MIB(MI);
- // Memmove requires that we perform the loads first before issuing the stores.
- // Apart from that, this loop is pretty much doing the same thing as the
- // memcpy codegen function.
- unsigned CurrOffset = 0;
- LLT PtrTy = MRI.getType(Src);
- SmallVector<Register, 16> LoadVals;
- for (auto CopyTy : MemOps) {
- // Construct MMO for the load.
- auto *LoadMMO =
- MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- // Create the load.
- Register LoadPtr = Src;
- if (CurrOffset != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
- LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
- }
- LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
- CurrOffset += CopyTy.getSizeInBytes();
- }
-
- CurrOffset = 0;
- for (unsigned I = 0; I < MemOps.size(); ++I) {
- LLT CopyTy = MemOps[I];
- // Now store the values loaded.
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- Register StorePtr = Dst;
- if (CurrOffset != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
- StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- }
- MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
- CurrOffset += CopyTy.getSizeInBytes();
- }
- MI.eraseFromParent();
- return true;
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemcpyInline(MI) ==
+ LegalizerHelper::LegalizeResult::Legalized;
}
bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
- const unsigned Opc = MI.getOpcode();
- // This combine is fairly complex so it's not written with a separate
- // matcher function.
- assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
- Opc == TargetOpcode::G_MEMSET) && "Expected memcpy like instruction");
-
- auto MMOIt = MI.memoperands_begin();
- const MachineMemOperand *MemOp = *MMOIt;
-
- Align DstAlign = MemOp->getBaseAlign();
- Align SrcAlign;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- Register Len = MI.getOperand(2).getReg();
-
- if (Opc != TargetOpcode::G_MEMSET) {
- assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
- MemOp = *(++MMOIt);
- SrcAlign = MemOp->getBaseAlign();
- }
-
- // See if this is a constant length copy
- auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
- if (!LenVRegAndVal)
- return false; // Leave it to the legalizer to lower it to a libcall.
- uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
-
- if (KnownLen == 0) {
- MI.eraseFromParent();
- return true;
- }
-
- bool IsVolatile = MemOp->isVolatile();
- if (Opc == TargetOpcode::G_MEMCPY_INLINE)
- return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
- IsVolatile);
-
- // Don't try to optimize volatile.
- if (IsVolatile)
- return false;
-
- if (MaxLen && KnownLen > MaxLen)
- return false;
-
- if (Opc == TargetOpcode::G_MEMCPY) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- bool OptSize = shouldLowerMemFuncForSize(MF);
- uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
- return optimizeMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
- IsVolatile);
- }
- if (Opc == TargetOpcode::G_MEMMOVE)
- return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
- if (Opc == TargetOpcode::G_MEMSET)
- return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
- return false;
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemCpyFamily(MI, MaxLen) ==
+ LegalizerHelper::LegalizeResult::Legalized;
}
static Optional<APFloat> constantFoldFpUnary(unsigned Opcode, LLT DstTy,
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9795589da9998..315c199d72bc4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -29,6 +29,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "legalizer"
@@ -3486,6 +3487,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_ROTL:
case G_ROTR:
return lowerRotate(MI);
+ case G_MEMSET:
+ case G_MEMCPY:
+ case G_MEMMOVE:
+ return lowerMemCpyFamily(MI);
+ case G_MEMCPY_INLINE:
+ return lowerMemcpyInline(MI);
GISEL_VECREDUCE_CASES_NONSEQ
return lowerVectorReduction(MI);
}
@@ -7419,3 +7426,544 @@ LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
}
return UnableToLegalize;;
}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction().hasMinSize();
+ return MF.getFunction().hasOptSize();
+}
+
+// Returns a list of types to use for memory op lowering in MemOps. A partial
+// port of findOptimalMemOpLowering in TargetLowering.
+static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
+ unsigned Limit, const MemOp &Op,
+ unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes,
+ const TargetLowering &TLI) {
+ if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+ return false;
+
+ LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
+
+ if (Ty == LLT()) {
+ // Use the largest scalar type whose alignment constraints are satisfied.
+ // We only need to check DstAlign here as SrcAlign is always greater or
+ // equal to DstAlign (or zero).
+ Ty = LLT::scalar(64);
+ if (Op.isFixedDstAlign())
+ while (Op.getDstAlign() < Ty.getSizeInBytes() &&
+ !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
+ Ty = LLT::scalar(Ty.getSizeInBytes());
+ assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
+ // FIXME: check for the largest legal type we can load/store to.
+ }
+
+ unsigned NumMemOps = 0;
+ uint64_t Size = Op.size();
+ while (Size) {
+ unsigned TySize = Ty.getSizeInBytes();
+ while (TySize > Size) {
+ // For now, only use non-vector load / store's for the left-over pieces.
+ LLT NewTy = Ty;
+ // FIXME: check for mem op safety and legality of the types. Not all of
+ // SDAGisms map cleanly to GISel concepts.
+ if (NewTy.isVector())
+ NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
+ NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
+ unsigned NewTySize = NewTy.getSizeInBytes();
+ assert(NewTySize > 0 && "Could not find appropriate type");
+
+ // If the new LLT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ bool Fast;
+ // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
+ MVT VT = getMVTForLLT(Ty);
+ if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
+ TLI.allowsMisalignedMemoryAccesses(
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast)
+ TySize = Size;
+ else {
+ Ty = NewTy;
+ TySize = NewTySize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(Ty);
+ Size -= TySize;
+ }
+
+ return true;
+}
+
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+ if (Ty.isVector())
+ return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
+ return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+// Get a vectorized representation of the memset value operand, GISel edition.
+static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NumBits = Ty.getScalarSizeInBits();
+ auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
+ if (!Ty.isVector() && ValVRegAndVal) {
+ APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
+ APInt SplatVal = APInt::getSplat(NumBits, Scalar);
+ return MIB.buildConstant(Ty, SplatVal).getReg(0);
+ }
+
+ // Extend the byte value to the larger type, and then multiply by a magic
+ // value 0x010101... in order to replicate it across every byte.
+ // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
+ if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
+ return MIB.buildConstant(Ty, 0).getReg(0);
+ }
+
+ LLT ExtType = Ty.getScalarType();
+ auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
+ if (NumBits > 8) {
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ auto MagicMI = MIB.buildConstant(ExtType, Magic);
+ Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
+ }
+
+ // For vector types create a G_BUILD_VECTOR.
+ if (Ty.isVector())
+ Val = MIB.buildSplatVector(Ty, Val).getReg(0);
+
+ return Val;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
+ uint64_t KnownLen, Align Alignment,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memset length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+
+ auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
+ bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
+
+ if (!findGISelOptimalMemOpLowering(MemOps, Limit,
+ MemOp::Set(KnownLen, DstAlignCanChange,
+ Alignment,
+ /*IsZeroMemset=*/IsZeroVal,
+ /*IsVolatile=*/IsVolatile),
+ DstPtrInfo.getAddrSpace(), ~0u,
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ MachineIRBuilder MIB(MI);
+ // Find the largest store and generate the bit pattern for it.
+ LLT LargestTy = MemOps[0];
+ for (unsigned i = 1; i < MemOps.size(); i++)
+ if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
+ LargestTy = MemOps[i];
+
+ // The memset stored value is always defined as an s8, so in order to make it
+ // work with larger store types we need to repeat the bit pattern across the
+ // wider type.
+ Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
+
+ if (!MemSetValue)
+ return UnableToLegalize;
+
+ // Generate the stores. For each store type in the list, we generate the
+ // matching store of that type to the destination address.
+ LLT PtrTy = MRI.getType(Dst);
+ unsigned DstOff = 0;
+ unsigned Size = KnownLen;
+ for (unsigned I = 0; I < MemOps.size(); I++) {
+ LLT Ty = MemOps[I];
+ unsigned TySize = Ty.getSizeInBytes();
+ if (TySize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(I == MemOps.size() - 1 && I != 0);
+ DstOff -= TySize - Size;
+ }
+
+ // If this store is smaller than the largest store see whether we can get
+ // the smaller value for free with a truncate.
+ Register Value = MemSetValue;
+ if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
+ MVT VT = getMVTForLLT(Ty);
+ MVT LargestVT = getMVTForLLT(LargestTy);
+ if (!LargestTy.isVector() && !Ty.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
+ else
+ Value = getMemsetValue(Val, Ty, MIB);
+ if (!Value)
+ return UnableToLegalize;
+ }
+
+ auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
+
+ Register Ptr = Dst;
+ if (DstOff != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
+ Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ }
+
+ MIB.buildStore(Value, Ptr, *StoreMMO);
+ DstOff += Ty.getSizeInBytes();
+ Size -= TySize;
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Len = MI.getOperand(2).getReg();
+
+ const auto *MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+ bool IsVolatile = MemOp->isVolatile();
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
+ // FIXME: support dynamically sized G_MEMCPY_INLINE
+ assert(LenVRegAndVal.hasValue() &&
+ "inline memcpy with dynamic size is not yet supported");
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ Align DstAlign = DstMMO.getBaseAlign();
+ Align SrcAlign = SrcMMO.getBaseAlign();
+
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+ return lowerMemcpy(MI, Dst, Src, KnownLen,
+ std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, uint64_t Limit, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memcpy length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ // FIXME: infer better src pointer alignment like SelectionDAG does here.
+ // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
+ // if the memcpy is in a tail call position.
+
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ IsVolatile),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+ // Now we need to emit a pair of load and stores for each of the types we've
+ // collected. I.e. for each type, generate a load from the source pointer of
+ // that type width, and then generate a corresponding store to the dest buffer
+ // of that value loaded. This can result in a sequence of loads and stores
+ // mixed types, depending on what the target specifies as good types to use.
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ unsigned Size = KnownLen;
+ for (auto CopyTy : MemOps) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ if (CopyTy.getSizeInBytes() > Size)
+ CurrOffset -= CopyTy.getSizeInBytes() - Size;
+
+ // Construct MMOs for the accesses.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ Register Offset;
+ if (CurrOffset != 0) {
+ Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
+ .getReg(0);
+ LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
+ }
+ auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
+
+ // Create the store.
+ Register StorePtr =
+ CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ MIB.buildStore(LdVal, StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ Size -= CopyTy.getSizeInBytes();
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign, Align SrcAlign,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memmove length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
+ // to a bug in it's findOptimalMemOpLowering implementation. For now do the
+ // same thing here.
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+ // Memmove requires that we perform the loads first before issuing the stores.
+ // Apart from that, this loop is pretty much doing the same thing as the
+ // memcpy codegen function.
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ SmallVector<Register, 16> LoadVals;
+ for (auto CopyTy : MemOps) {
+ // Construct MMO for the load.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
+ }
+ LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+
+ CurrOffset = 0;
+ for (unsigned I = 0; I < MemOps.size(); ++I) {
+ LLT CopyTy = MemOps[I];
+ // Now store the values loaded.
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ Register StorePtr = Dst;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ }
+ MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
+ const unsigned Opc = MI.getOpcode();
+ // This combine is fairly complex so it's not written with a separate
+ // matcher function.
+ assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
+ Opc == TargetOpcode::G_MEMSET) &&
+ "Expected memcpy like instruction");
+
+ auto MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+
+ Align DstAlign = MemOp->getBaseAlign();
+ Align SrcAlign;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Len = MI.getOperand(2).getReg();
+
+ if (Opc != TargetOpcode::G_MEMSET) {
+ assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
+ MemOp = *(++MMOIt);
+ SrcAlign = MemOp->getBaseAlign();
+ }
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
+ if (!LenVRegAndVal)
+ return UnableToLegalize;
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
+
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ bool IsVolatile = MemOp->isVolatile();
+ if (Opc == TargetOpcode::G_MEMCPY_INLINE)
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+
+ // Don't try to optimize volatile.
+ if (IsVolatile)
+ return UnableToLegalize;
+
+ if (MaxLen && KnownLen > MaxLen)
+ return UnableToLegalize;
+
+ if (Opc == TargetOpcode::G_MEMCPY) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
+ return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
+ IsVolatile);
+ }
+ if (Opc == TargetOpcode::G_MEMMOVE)
+ return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
+ if (Opc == TargetOpcode::G_MEMSET)
+ return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
+ return UnableToLegalize;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1192da24f04f4..2efada6758cc0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1679,6 +1679,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Implement
G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
+ .lower();
+
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 13f09ab8f1649..ba08af2ecfcbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -205,8 +205,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return true;
switch (MI.getOpcode()) {
- case TargetOpcode::G_MEMCPY_INLINE:
- return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
new file mode 100644
index 0000000000000..614aafeb04398
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: memcpy_test
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpy_test
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (load (s8))
+ ; CHECK: G_STORE [[LOAD]](s32), [[MV]](p0) :: (store (s8))
+ ; CHECK: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (store (s8)), (load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
new file mode 100644
index 0000000000000..5437c8078ad0e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: memcpyinline_test
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpyinline_test
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (load (s8))
+ ; CHECK: G_STORE [[LOAD]](s32), [[MV]](p0) :: (store (s8))
+ ; CHECK: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (store (s8)), (load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
new file mode 100644
index 0000000000000..0ca5e90da5c60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: memmove_test
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memmove_test
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (load (s8))
+ ; CHECK: G_STORE [[LOAD]](s32), [[MV]](p0) :: (store (s8))
+ ; CHECK: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (store (s8)), (load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
new file mode 100644
index 0000000000000..35725748c83a3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: memset_test
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: memset_test
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8)
+ ; CHECK: G_STORE [[COPY2]](s32), [[MV]](p0) :: (store (s8))
+ ; CHECK: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s16) = G_TRUNC %3:_(s32)
+ %5:_(s8) = G_TRUNC %4:_(s16)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (store (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
new file mode 100644
index 0000000000000..8ff79e1184ce4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+
+declare void @llvm.memcpy.inline.p1i8.p1i8.i32(i8 addrspace(1)*, i8 addrspace(1)*, i32, i1 immarg)
+
+define amdgpu_cs void @test(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
+; GCN-LABEL: test:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b64 s[0:1], 0
+; GCN-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; GCN-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
+; GCN-NEXT: s_endpgm
+ call void @llvm.memcpy.inline.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 4, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
new file mode 100644
index 0000000000000..4f597af1ead1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
+
+declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)*, i8 addrspace(1)*, i32, i1 immarg)
+
+define amdgpu_cs void @memcpy_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
+; LOOP-LABEL: memcpy_p1i8:
+; LOOP: ; %bb.0:
+; LOOP-NEXT: s_mov_b32 s6, 0
+; LOOP-NEXT: s_mov_b32 s7, 0xf000
+; LOOP-NEXT: s_mov_b64 s[4:5], 0
+; LOOP-NEXT: v_mov_b32_e32 v5, v3
+; LOOP-NEXT: v_mov_b32_e32 v4, v2
+; LOOP-NEXT: v_mov_b32_e32 v7, v1
+; LOOP-NEXT: v_mov_b32_e32 v6, v0
+; LOOP-NEXT: v_mov_b32_e32 v8, s6
+; LOOP-NEXT: BB0_1: ; %load-store-loop
+; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
+; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
+; LOOP-NEXT: buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
+; LOOP-NEXT: buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6
+; LOOP-NEXT: buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7
+; LOOP-NEXT: buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8
+; LOOP-NEXT: s_waitcnt expcnt(6)
+; LOOP-NEXT: buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9
+; LOOP-NEXT: s_waitcnt expcnt(5)
+; LOOP-NEXT: buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10
+; LOOP-NEXT: s_waitcnt expcnt(4)
+; LOOP-NEXT: buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11
+; LOOP-NEXT: s_waitcnt expcnt(3)
+; LOOP-NEXT: buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12
+; LOOP-NEXT: s_waitcnt expcnt(2)
+; LOOP-NEXT: buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT: s_waitcnt expcnt(1)
+; LOOP-NEXT: buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT: v_add_i32_e32 v8, vcc, 1, v8
+; LOOP-NEXT: s_xor_b64 s[0:1], vcc, -1
+; LOOP-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; LOOP-NEXT: s_and_b64 vcc, s[0:1], exec
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64
+; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:4
+; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:5
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:7
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:8
+; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:10
+; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:11
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[4:7], 0 addr64 offset:12
+; LOOP-NEXT: buffer_store_byte v22, v[6:7], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[4:7], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT: v_add_i32_e64 v6, s[0:1], 16, v6
+; LOOP-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
+; LOOP-NEXT: v_add_i32_e64 v4, s[0:1], 16, v4
+; LOOP-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; LOOP-NEXT: s_cbranch_vccnz BB0_1
+; LOOP-NEXT: ; %bb.2: ; %memcpy-split
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: s_endpgm
+;
+; UNROLL-LABEL: memcpy_p1i8:
+; UNROLL: ; %bb.0:
+; UNROLL-NEXT: s_mov_b32 s2, 0
+; UNROLL-NEXT: s_mov_b32 s3, 0xf000
+; UNROLL-NEXT: s_mov_b64 s[0:1], 0
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:2
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:4
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:4
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:5
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:5
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:6
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:6
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:7
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:7
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:8
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:8
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:9
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:9
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:10
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:11
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:11
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:12
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:12
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:13
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:13
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:14
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:14
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:15
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:15
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:17
+; UNROLL-NEXT: s_waitcnt expcnt(0)
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:18
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
+; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT: s_waitcnt vmcnt(0)
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT: s_endpgm
+ call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 20, i1 false)
+ ret void
+}
+
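For the LOOP run above, the 20-byte copy is split into a 16-bytes-per-iteration %load-store-loop plus a 4-byte %memcpy-split tail, while the UNROLL run emits all 20 byte copies straight-line. As a rough illustration of the IR-level expansion described in the commit message, here is a simplified byte-at-a-time sketch; the function, value and block names are illustrative, not the pass's literal output, which uses wider accesses where alignment allows:

; Simplified sketch only: copies one byte per iteration for clarity.
define void @memcpy_loop_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
entry:
  br label %load-store-loop
load-store-loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %load-store-loop ]
  %s = getelementptr inbounds i8, i8 addrspace(1)* %src, i32 %i
  %d = getelementptr inbounds i8, i8 addrspace(1)* %dst, i32 %i
  %b = load i8, i8 addrspace(1)* %s, align 1
  store i8 %b, i8 addrspace(1)* %d, align 1
  %i.next = add i32 %i, 1
  %cont = icmp ult i32 %i.next, 20
  br i1 %cont, label %load-store-loop, label %memcpy-split
memcpy-split:
  ret void
}

Only the -amdgpu-mem-intrinsic-expand-size threshold decides whether the constant-length call ends up as a loop like this or as the fully unrolled load/store sequence checked above.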
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
new file mode 100644
index 0000000000000..e379699bf36b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+
+declare void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)*, i8 addrspace(1)*, i32, i1)
+
+define amdgpu_cs void @memmove_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
+; LOOP-LABEL: memmove_p1i8:
+; LOOP: ; %bb.0:
+; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
+; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; LOOP-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
+; LOOP-NEXT: s_cbranch_execz BB0_3
+; LOOP-NEXT: ; %bb.1: ; %copy_forward
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: v_mov_b32_e32 v5, s1
+; LOOP-NEXT: v_mov_b32_e32 v4, s0
+; LOOP-NEXT: BB0_2: ; %copy_forward_loop
+; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc
+; LOOP-NEXT: v_add_i32_e32 v4, vcc, 1, v4
+; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; LOOP-NEXT: v_cmp_ne_u32_e32 vcc, 4, v4
+; LOOP-NEXT: s_waitcnt vmcnt(0)
+; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: s_cbranch_vccnz BB0_2
+; LOOP-NEXT: BB0_3: ; %Flow14
+; LOOP-NEXT: s_or_saveexec_b64 s[0:1], s[4:5]
+; LOOP-NEXT: s_xor_b64 exec, exec, s[0:1]
+; LOOP-NEXT: s_cbranch_execz BB0_6
+; LOOP-NEXT: ; %bb.4: ; %copy_backwards
+; LOOP-NEXT: s_mov_b64 s[4:5], 3
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
+; LOOP-NEXT: v_mov_b32_e32 v4, s4
+; LOOP-NEXT: v_mov_b32_e32 v5, s5
+; LOOP-NEXT: BB0_5: ; %copy_backwards_loop
+; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4
+; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc
+; LOOP-NEXT: v_add_i32_e32 v4, vcc, -1, v4
+; LOOP-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
+; LOOP-NEXT: v_cmp_eq_u32_e32 vcc, -1, v4
+; LOOP-NEXT: s_waitcnt vmcnt(0)
+; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT: s_cbranch_vccz BB0_5
+; LOOP-NEXT: BB0_6: ; %memmove_done
+; LOOP-NEXT: s_endpgm
+;
+; UNROLL-LABEL: memmove_p1i8:
+; UNROLL: ; %bb.0:
+; UNROLL-NEXT: s_mov_b32 s2, 0
+; UNROLL-NEXT: s_mov_b32 s3, 0xf000
+; UNROLL-NEXT: s_mov_b64 s[0:1], 0
+; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
+; UNROLL-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:1
+; UNROLL-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
+; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: s_waitcnt vmcnt(3)
+; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
+; UNROLL-NEXT: s_waitcnt vmcnt(3)
+; UNROLL-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:1
+; UNROLL-NEXT: s_waitcnt vmcnt(3)
+; UNROLL-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:2
+; UNROLL-NEXT: s_waitcnt vmcnt(3)
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: s_endpgm
+ call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 4, i1 false)
+ ret void
+}
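The LOOP output makes the overlap handling visible: v_cmp_ge_u64 compares %src against %dst and takes the %copy_forward path, otherwise the %copy_backwards loop walks from the last byte down. A hedged byte-at-a-time sketch of that shape follows; names are illustrative, and the zero-length guard a general-purpose expansion would need is omitted to match the constant length 4 used in the test:

; Simplified sketch only: assumes %n > 0 and copies one byte per iteration.
define void @memmove_byte_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %n) {
entry:
  %s.int = ptrtoint i8 addrspace(1)* %src to i64
  %d.int = ptrtoint i8 addrspace(1)* %dst to i64
  %fwd = icmp uge i64 %s.int, %d.int
  br i1 %fwd, label %copy_forward_loop, label %copy_backwards_loop
copy_forward_loop:                                ; walk from byte 0 upwards
  %fi = phi i32 [ 0, %entry ], [ %fi.next, %copy_forward_loop ]
  %fs = getelementptr inbounds i8, i8 addrspace(1)* %src, i32 %fi
  %fd = getelementptr inbounds i8, i8 addrspace(1)* %dst, i32 %fi
  %fb = load i8, i8 addrspace(1)* %fs, align 1
  store i8 %fb, i8 addrspace(1)* %fd, align 1
  %fi.next = add i32 %fi, 1
  %f.cont = icmp ult i32 %fi.next, %n
  br i1 %f.cont, label %copy_forward_loop, label %memmove_done
copy_backwards_loop:                              ; walk from byte %n-1 downwards
  %bi = phi i32 [ %n, %entry ], [ %bi.next, %copy_backwards_loop ]
  %bi.next = sub i32 %bi, 1
  %bs = getelementptr inbounds i8, i8 addrspace(1)* %src, i32 %bi.next
  %bd = getelementptr inbounds i8, i8 addrspace(1)* %dst, i32 %bi.next
  %bb = load i8, i8 addrspace(1)* %bs, align 1
  store i8 %bb, i8 addrspace(1)* %bd, align 1
  %b.cont = icmp ne i32 %bi.next, 0
  br i1 %b.cont, label %copy_backwards_loop, label %memmove_done
memmove_done:
  ret void
}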
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
new file mode 100644
index 0000000000000..f70929b8f2b12
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+
+declare void @llvm.memset.p1i8.i32(i8 addrspace(1)*, i8, i32, i1)
+
+define amdgpu_cs void @memset_p1i8(i8 addrspace(1)* %dst, i8 %val) {
+; LOOP-LABEL: memset_p1i8:
+; LOOP: ; %bb.0: ; %loadstoreloop.preheader
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: v_mov_b32_e32 v4, s1
+; LOOP-NEXT: v_mov_b32_e32 v3, s0
+; LOOP-NEXT: BB0_1: ; %loadstoreloop
+; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3
+; LOOP-NEXT: v_addc_u32_e32 v6, vcc, v1, v4, vcc
+; LOOP-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; LOOP-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 4, v3
+; LOOP-NEXT: buffer_store_byte v2, v[5:6], s[0:3], 0 addr64
+; LOOP-NEXT: s_cbranch_vccnz BB0_1
+; LOOP-NEXT: ; %bb.2: ; %split
+; LOOP-NEXT: s_endpgm
+;
+; UNROLL-LABEL: memset_p1i8:
+; UNROLL: ; %bb.0:
+; UNROLL-NEXT: s_mov_b32 s2, 0
+; UNROLL-NEXT: s_mov_b32 s3, 0xf000
+; UNROLL-NEXT: s_mov_b64 s[0:1], 0
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:1
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:2
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: s_endpgm
+ call void @llvm.memset.p1i8.i32(i8 addrspace(1)* %dst, i8 %val, i32 4, i1 false)
+ ret void
+}
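The memset expansion follows the same pattern with a single store loop (the %loadstoreloop and %split blocks in the LOOP output). A minimal sketch, again byte-at-a-time, with illustrative names and assuming a non-zero length:

; Simplified sketch only: assumes %n > 0.
define void @memset_byte_sketch(i8 addrspace(1)* %dst, i8 %val, i32 %n) {
entry:
  br label %loadstoreloop
loadstoreloop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loadstoreloop ]
  %p = getelementptr inbounds i8, i8 addrspace(1)* %dst, i32 %i
  store i8 %val, i8 addrspace(1)* %p, align 1
  %i.next = add i32 %i, 1
  %cont = icmp ult i32 %i.next, %n
  br i1 %cont, label %loadstoreloop, label %split
split:
  ret void
}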
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-memcpy-inline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-memcpy-inline.mir
deleted file mode 100644
index 1e4ff0324793f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-memcpy-inline.mir
+++ /dev/null
@@ -1,81 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
---- |
- target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
- target triple = "amdgcn-amd-amdhsa"
-
- declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64 immarg, i1 immarg) #0
-
- define void @test_memcpy_inline(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #1 {
- entry:
- %0 = bitcast i32* %dst to i8*
- %1 = bitcast i32* %src to i8*
- tail call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 13, i1 false)
- ret void
- }
-
- attributes #0 = { argmemonly nofree nounwind willreturn "target-cpu"="gfx900" }
- attributes #1 = { "target-cpu"="gfx900" }
-
-...
----
-name: test_memcpy_inline
-alignment: 1
-tracksRegLiveness: true
-registers:
- - { id: 0, class: _ }
- - { id: 1, class: _ }
- - { id: 2, class: sgpr_64 }
- - { id: 3, class: _ }
- - { id: 4, class: _ }
- - { id: 5, class: _ }
- - { id: 6, class: _ }
- - { id: 7, class: _ }
- - { id: 8, class: ccr_sgpr_64 }
-liveins:
- - { reg: '$sgpr30_sgpr31', virtual-reg: '%2' }
-frameInfo:
- maxAlignment: 1
-machineFunctionInfo:
- maxKernArgAlign: 1
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- frameOffsetReg: '$sgpr33'
- stackPtrOffsetReg: '$sgpr32'
- argumentInfo:
- privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
- occupancy: 10
-body: |
- bb.1.entry:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
-
- ; CHECK-LABEL: name: test_memcpy_inline
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[MV1]](p0) :: (load (s64) from %ir.1, align 4)
- ; CHECK: G_STORE [[LOAD]](s64), [[MV]](p0) :: (store (s64) into %ir.0, align 4)
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
- ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV1]], [[C]](s64)
- ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from %ir.1 + 5, align 1, basealign 4)
- ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[MV]], [[C]](s64)
- ; CHECK: G_STORE [[LOAD1]](s64), [[PTR_ADD1]](p0) :: (store (s64) into %ir.0 + 5, align 1, basealign 4)
- ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
- ; CHECK: S_SETPC_B64_return [[COPY5]]
- %3:_(s32) = COPY $vgpr0
- %4:_(s32) = COPY $vgpr1
- %0:_(p0) = G_MERGE_VALUES %3(s32), %4(s32)
- %5:_(s32) = COPY $vgpr2
- %6:_(s32) = COPY $vgpr3
- %1:_(p0) = G_MERGE_VALUES %5(s32), %6(s32)
- %2:sgpr_64 = COPY $sgpr30_sgpr31
- %7:_(s64) = G_CONSTANT i64 13
- G_MEMCPY_INLINE %0(p0), %1(p0), %7(s64) :: (store (s8) into %ir.0, align 4), (load (s8) from %ir.1, align 4)
- %8:ccr_sgpr_64 = COPY %2
- S_SETPC_B64_return %8
-
-...