[llvm] [CodeGen][PreISelIntrinsicLowering] Add VP-based lowering for memcpy/memmove/memset (PR #165585)
David Del Río via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 06:47:40 PST 2025
https://github.com/dadra-oc updated https://github.com/llvm/llvm-project/pull/165585
>From fa59e8db87ab10b73c92e76bc2e8b3406dac17d3 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad at comstyle.com>
Date: Wed, 10 Dec 2025 03:09:54 -0500
Subject: [PATCH] [compiler-rt][sanitizer] fix i386 build for Haiku (#171075)
r13 does not provide the trap err.
Co-authored-by: Jerome Duval <jerome.duval at gmail.com>
---
.../llvm/Analysis/TargetTransformInfo.h | 2 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
.../Transforms/Utils/LowerMemIntrinsics.h | 2 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 16 +-
.../lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 2 +-
.../Target/RISCV/RISCVTargetTransformInfo.h | 3 +
.../Target/SPIRV/SPIRVPrepareFunctions.cpp | 7 +-
.../Transforms/Utils/LowerMemIntrinsics.cpp | 394 +++++++++++++++++-
.../RISCV/expand-vp-mem-ops.ll | 175 ++++++++
10 files changed, 588 insertions(+), 21 deletions(-)
create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/expand-vp-mem-ops.ll
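At a high level, the new expansion replaces the scalar byte-copy loop with a loop driven by llvm.experimental.get.vector.length and a single vp.load/vp.store pair per iteration over a <vscale x 8 x i8> chunk. As a rough C sketch (illustrative only, written in the same e8m8 style as the memmove pseudocode further down in the patch; vp_memcpy_sketch is not part of the patch):

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

static void vp_memcpy_sketch(uint8_t *d, const uint8_t *s, size_t n) {
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e8m8(n);         // llvm.experimental.get.vector.length
    vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);  // llvm.vp.load
    __riscv_vse8_v_u8m8(d, v, vl);              // llvm.vp.store
    s += vl;
    d += vl;
    n -= vl;
  }
}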
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 19d5fb0feb73e..870330d62f0a7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -734,6 +734,8 @@ class TargetTransformInfo {
LLVM_ABI TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const;
+ LLVM_ABI bool preferMemIntrinsicVPExpansion() const;
+
// Parameters that control the loop peeling transformation
struct PeelingPreferences {
/// A forced peeling factor (the number of bodied of the original loop
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index da104ffe0a6e6..b396348478982 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -236,6 +236,10 @@ class TargetTransformInfoImplBase {
return TailFoldingStyle::DataWithoutLaneMask;
}
+ virtual bool preferMemIntrinsicVPExpansion() const {
+ return false;
+ }
+
virtual std::optional<Instruction *>
instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return std::nullopt;
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..b44b58808cfce 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -59,7 +59,7 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
const TargetTransformInfo &TTI);
/// Expand \p MemSet as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
+LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet, const TargetTransformInfo &TTI);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6201f6a2acbf3..81811e4613f73 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -377,6 +377,10 @@ TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
}
+bool TargetTransformInfo::preferMemIntrinsicVPExpansion() const {
+ return TTIImpl->preferMemIntrinsicVPExpansion();
+}
+
std::optional<Instruction *>
TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 97da9abab0b7b..4806f5d97f755 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -50,6 +50,12 @@ static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
cl::Hidden);
+static cl::opt<bool>
+ ForceMemIntrinsicExpansion("force-mem-intrinsic-expansion",
+ cl::desc("Force expansion of memory intrinsics "
+ "instead of lowering to libc calls"),
+ cl::init(false));
+
namespace {
struct PreISelIntrinsicLowering {
@@ -70,8 +76,8 @@ struct PreISelIntrinsicLowering {
function_ref<TargetLibraryInfo &(Function &)> LookupTLI_,
bool UseMemIntrinsicLibFunc_ = true)
: TM(TM_), ModuleLibcalls(ModuleLibcalls_), LookupTTI(LookupTTI_),
- LookupTLI(LookupTLI_), UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {
- }
+ LookupTLI(LookupTLI_), UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_ &&
+ !ForceMemIntrinsicExpansion) {}
static bool shouldExpandMemIntrinsicWithSize(Value *Size,
const TargetTransformInfo &TTI);
@@ -381,7 +387,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
canEmitLibcall(ModuleLibcalls, TM, ParentFunc, RTLIB::MEMSET))
break;
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
}
@@ -396,7 +402,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
if (isa<ConstantInt>(Memset->getLength()))
break;
- expandMemSetAsLoop(Memset);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index ac6f4061b9f1f..d0b50d2610bd5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove, TTI);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
}
MemCall->eraseFromParent();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index e6b75d7c458db..14997e1b32a99 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -122,6 +122,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
: TailFoldingStyle::None;
}
+ bool preferMemIntrinsicVPExpansion() const override {
+ return ST->hasVInstructions();
+ }
std::optional<unsigned> getMaxVScale() const override;
std::optional<unsigned> getVScaleForTuning() const override;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index fdd0af871e03e..13542ca8d4c6f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -23,6 +23,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
@@ -96,7 +97,7 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
return NewF;
}
-static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
+static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic, const SPIRVTargetMachine &TM) {
// For @llvm.memset.* intrinsic cases with constant value and length arguments
// are emulated via "storing" a constant array to the destination. For other
// cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
@@ -140,7 +141,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
MSI->isVolatile());
IRB.CreateRetVoid();
- expandMemSetAsLoop(cast<MemSetInst>(MemSet));
+ expandMemSetAsLoop(cast<MemSetInst>(MemSet), TM.getTargetTransformInfo(*F));
MemSet->eraseFromParent();
break;
}
@@ -414,7 +415,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::bswap:
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TM);
break;
case Intrinsic::fshl:
case Intrinsic::fshr:
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 4ab99edd64baa..f636042066da1 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -233,6 +233,92 @@ static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore,
return LEI;
}
+static void buildScalableVectorForwardCpyLoop(BasicBlock *EntryBB,
+ BasicBlock *LoopBB,
+ BasicBlock *ExitBB,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen) {
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ Module *M = F->getParent();
+
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ ElementCount EC = ElementCount::getScalable(8);
+ Type *VecTy = VectorType::get(Int8Ty, EC);
+ Type *PtrTy = PointerType::get(VecTy, 0);
+
+ Type *Int1Ty = Type::getInt1Ty(Ctx);
+ Type *MaskTy = VectorType::get(Int1Ty, EC);
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+ Type *LenType = CopyLen->getType();
+ Value *TrueMaskVec = Constant::getAllOnesValue(MaskTy);
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *Src = LoopBuilder.CreatePHI(SrcAddr->getType(), 2, "src.f");
+ PHINode *Dst = LoopBuilder.CreatePHI(DstAddr->getType(), 2, "dst.f");
+ PHINode *Len = LoopBuilder.CreatePHI(LenType, 2, "len.f");
+
+ Src->addIncoming(SrcAddr, EntryBB);
+ Dst->addIncoming(DstAddr, EntryBB);
+ Len->addIncoming(CopyLen, EntryBB);
+
+  // Load and store a chunk of data
+ Value *SrcBitcast = LoopBuilder.CreateBitCast(Src, PtrTy);
+ Value *DstBitcast = LoopBuilder.CreateBitCast(Dst, PtrTy);
+
+ Value *ActualVL = LoopBuilder.CreateIntrinsic(
+ Intrinsic::experimental_get_vector_length, {LenType},
+ {Len, ConstantInt::get(Int32Ty, 8), LoopBuilder.getTrue()});
+
+ FunctionCallee VPLoad =
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vp_load, {VecTy, PtrTy});
+ Value *Vec =
+ LoopBuilder.CreateCall(VPLoad, {SrcBitcast, TrueMaskVec, ActualVL});
+
+ FunctionCallee VPStore =
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vp_store, {VecTy, PtrTy});
+ LoopBuilder.CreateCall(VPStore, {Vec, DstBitcast, TrueMaskVec, ActualVL});
+
+ // Update loop state
+ Value *SrcNext = LoopBuilder.CreateGEP(Int8Ty, Src, ActualVL);
+ Value *DstNext = LoopBuilder.CreateGEP(Int8Ty, Dst, ActualVL);
+
+  // On 64-bit targets the two operands used to update Len have different types:
+ // VL -> Int32Ty
+ // Len -> Int64Ty
+ Value *LenNext =
+ LoopBuilder.CreateSub(Len, LoopBuilder.CreateZExt(ActualVL, LenType));
+
+ Value *Cond =
+ LoopBuilder.CreateICmpUGT(LenNext, ConstantInt::get(LenType, 0));
+
+ LoopBuilder.CreateCondBr(Cond, LoopBB, ExitBB);
+ Src->addIncoming(SrcNext, LoopBB);
+ Dst->addIncoming(DstNext, LoopBB);
+ Len->addIncoming(LenNext, LoopBB);
+}
+
+static void createMemCpyScalableVectorLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *EntryBB = InsertBefore->getParent();
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+
+ BasicBlock *ExitBB =
+ EntryBB->splitBasicBlock(InsertBefore, "memcpy.vec.exit");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "memcpy.vec.loop", F, ExitBB);
+ EntryBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ buildScalableVectorForwardCpyLoop(EntryBB, LoopBB, ExitBB, SrcAddr, DstAddr,
+ CopyLen);
+}
+
void llvm::createMemCpyLoopKnownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile,
@@ -475,6 +561,168 @@ tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
return {ResAddr1, ResAddr2};
}
+// Lower memmove to IR using VP intrinsics. memmove is required to correctly
+// copy overlapping memory regions; therefore, it has to check the relative
+// positions of the source and destination pointers and choose the copy
+// direction accordingly.
+//
+// The code below is an IR rendition of this C function using RISC-V intrinsics:
+//
+// void* memmove(void* dest, const void* src, size_t n) {
+// uint8_t *d = (uint8_t *)dest;
+// const uint8_t *s = (const uint8_t *)src;
+// if (s < d) {
+// // Backward copy
+// s += n;
+// d += n;
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// s -= vl;
+// d -= vl;
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// n -= vl;
+// }
+// } else {
+// // Forward copy
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// s += vl;
+// d += vl;
+// n -= vl;
+// }
+// }
+// return dest;
+// }
+static void createMemMoveScalableVectorLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+
+ BasicBlock *EntryBB = InsertBefore->getParent();
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ Module *M = F->getParent();
+
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ ElementCount EC = ElementCount::getScalable(8);
+ Type *VecTy = VectorType::get(Int8Ty, EC); // <vscale x 8 x i8>
+  Type *PtrTy = PointerType::get(VecTy, 0);  // opaque ptr in addrspace(0)
+
+ Type *Int1Ty = Type::getInt1Ty(Ctx);
+ Type *MaskTy = VectorType::get(Int1Ty, EC);
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+ Type *LenType = CopyLen->getType();
+ Value *TrueMaskVec = Constant::getAllOnesValue(MaskTy);
+
+ IRBuilder<> Builder(InsertBefore);
+  // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ // If the pointers are in different address spaces, they need to be converted
+ // to a compatible one. Cases where memory ranges in the different address
+ // spaces cannot overlap are lowered as memcpy and not handled here.
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(Builder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ Builder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_addr");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+  // Then we will have two parts, backward and forward copy, to ensure
+  // correctness of the result on overlapping memory regions.
+ BasicBlock *BackwardBB = ThenTerm->getParent();
+ BackwardBB->setName("vec.backward");
+ BasicBlock *ForwardBB = ElseTerm->getParent();
+ ForwardBB->setName("vec.forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("vec.done");
+
+ // === Backward Copy Loop ===
+  // This part is divided into two blocks:
+  // - The preheader, where pointers are moved to the end of the region to copy
+ // - The loop block that will perform the data copy
+ {
+    // Preheader block: advance the Src and Dst addresses to the end of the region
+ IRBuilder<> BackwardBuilder(BackwardBB);
+ Value *SrcStart = BackwardBuilder.CreateGEP(Int8Ty, SrcAddr, CopyLen);
+ Value *DstStart = BackwardBuilder.CreateGEP(Int8Ty, DstAddr, CopyLen);
+
+ BasicBlock *BackwardLoopBB =
+ BasicBlock::Create(F->getContext(), "vec.backward.loop", F, ForwardBB);
+ BackwardBuilder.CreateBr(BackwardLoopBB);
+
+ // Backward copy loop
+ IRBuilder<> BackwardLoopBuilder(BackwardLoopBB);
+ PHINode *Src =
+ BackwardLoopBuilder.CreatePHI(SrcStart->getType(), 2, "src.b");
+ PHINode *Dst =
+ BackwardLoopBuilder.CreatePHI(DstStart->getType(), 2, "dst.b");
+ PHINode *Len = BackwardLoopBuilder.CreatePHI(LenType, 2, "len.b");
+
+ Src->addIncoming(SrcStart, BackwardBB);
+ Dst->addIncoming(DstStart, BackwardBB);
+ Len->addIncoming(CopyLen, BackwardBB);
+
+ // Compute next chunk to be processed
+ Value *VL = Len;
+ Value *ActualVL = BackwardLoopBuilder.CreateIntrinsic(
+ Intrinsic::experimental_get_vector_length, {LenType},
+ {VL, ConstantInt::get(Int32Ty, 8), BackwardLoopBuilder.getTrue()});
+
+ Value *SrcNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Src, BackwardLoopBuilder.CreateNeg(ActualVL));
+ Value *DstNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Dst, BackwardLoopBuilder.CreateNeg(ActualVL));
+
+ // Load and store a chunk
+ Value *SrcBitcast = BackwardLoopBuilder.CreateBitCast(SrcNext, PtrTy);
+ Value *DstBitcast = BackwardLoopBuilder.CreateBitCast(DstNext, PtrTy);
+
+ FunctionCallee VPLoad = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_load, {VecTy, PtrTy});
+ Value *Vec = BackwardLoopBuilder.CreateCall(
+ VPLoad, {SrcBitcast, TrueMaskVec, ActualVL});
+
+ FunctionCallee VPStore = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_store, {VecTy, PtrTy});
+ BackwardLoopBuilder.CreateCall(VPStore,
+ {Vec, DstBitcast, TrueMaskVec, ActualVL});
+
+ // Update loop state
+    // On 64-bit targets the two operands used to update Len have different types:
+ // VL -> Int32Ty
+ // Len -> Int64Ty
+ Value *LenNext = BackwardLoopBuilder.CreateSub(
+ Len, BackwardLoopBuilder.CreateZExt(ActualVL, LenType));
+
+ Value *CondB = BackwardLoopBuilder.CreateICmpUGT(
+ LenNext, ConstantInt::get(LenType, 0));
+ BackwardLoopBuilder.CreateCondBr(CondB, BackwardLoopBB, ExitBB);
+
+ Src->addIncoming(SrcNext, BackwardLoopBB);
+ Dst->addIncoming(DstNext, BackwardLoopBB);
+ Len->addIncoming(LenNext, BackwardLoopBB);
+
+ ThenTerm->eraseFromParent();
+ }
+
+ // === Forward Copy Loop ===
+ buildScalableVectorForwardCpyLoop(EntryBB, ForwardBB, ExitBB, SrcAddr,
+ DstAddr, CopyLen);
+ ElseTerm->eraseFromParent();
+}
+
// Lower memmove to IR. memmove is required to correctly copy overlapping memory
// regions; therefore, it has to check the relative positions of the source and
// destination pointers and choose the copy direction accordingly.
@@ -929,6 +1177,84 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
}
}
+static void createMemSetScalableVectorLoop(Instruction *InsertBefore,
+ Value *DstAddr, Value *CopyLen,
+ Value *SetValue, Align DstAlign,
+ bool IsVolatile) {
+ BasicBlock *EntryBB = InsertBefore->getParent();
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ Module *M = F->getParent();
+
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ ElementCount EC = ElementCount::getScalable(8);
+ Type *VecTy = VectorType::get(Int8Ty, EC);
+ Type *PtrTy = PointerType::get(VecTy, 0);
+
+ Type *Int1Ty = Type::getInt1Ty(Ctx);
+ Type *MaskTy = VectorType::get(Int1Ty, EC);
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+ Type *ShuffleMaskTy = VectorType::get(Int32Ty, EC);
+
+ Type *LenType = CopyLen->getType();
+
+ Value *TrueMaskVec = Constant::getAllOnesValue(MaskTy);
+ BasicBlock *ExitBB =
+ EntryBB->splitBasicBlock(InsertBefore, "memset.vec.exit");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "memset.vec.loop", F, ExitBB);
+ EntryBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ {
+ IRBuilder<> LoopBuilder(LoopBB);
+    PHINode *Dst = LoopBuilder.CreatePHI(DstAddr->getType(), 2, "dst.f");
+ PHINode *Len = LoopBuilder.CreatePHI(LenType, 2, "len.f");
+
+ Dst->addIncoming(DstAddr, EntryBB);
+ Len->addIncoming(CopyLen, EntryBB);
+
+ // Broadcast scalar value into vector
+ Value *Val = (SetValue->getType() != Int8Ty)
+ ? LoopBuilder.CreateTrunc(SetValue, Int8Ty)
+ : SetValue;
+
+ Value *LaneValue = LoopBuilder.CreateInsertElement(
+ UndefValue::get(VecTy), Val, LoopBuilder.getInt32(0));
+
+ Value *ValueVec = LoopBuilder.CreateShuffleVector(
+ LaneValue, UndefValue::get(VecTy),
+ Constant::getNullValue(ShuffleMaskTy));
+
+    // Store a chunk of data
+ Value *ActualVL = LoopBuilder.CreateIntrinsic(
+ Intrinsic::experimental_get_vector_length, {LenType},
+ {Len, ConstantInt::get(Int32Ty, 8), LoopBuilder.getTrue()});
+
+ Value *DstBitcast = LoopBuilder.CreateBitCast(Dst, PtrTy);
+ FunctionCallee VPStore = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_store, {VecTy, PtrTy});
+ LoopBuilder.CreateCall(VPStore,
+ {ValueVec, DstBitcast, TrueMaskVec, ActualVL});
+
+ // Update loop state
+ Value *DstNext = LoopBuilder.CreateGEP(Int8Ty, Dst, ActualVL);
+
+    // On 64-bit targets the two operands used to update Len have different types:
+ // VL -> Int32Ty
+ // Len -> Int64Ty
+ Value *LenNext =
+ LoopBuilder.CreateSub(Len, LoopBuilder.CreateZExt(ActualVL, LenType));
+
+ Value *Cond =
+ LoopBuilder.CreateICmpUGT(LenNext, ConstantInt::get(LenType, 0));
+
+ LoopBuilder.CreateCondBr(Cond, LoopBB, ExitBB);
+ Dst->addIncoming(DstNext, LoopBB);
+ Len->addIncoming(LenNext, LoopBB);
+ }
+}
+
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
bool IsVolatile) {
@@ -936,10 +1262,9 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getDataLayout();
- BasicBlock *NewBB =
- OrigBB->splitBasicBlock(InsertBefore, "split");
- BasicBlock *LoopBB
- = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+ BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
IRBuilder<> Builder(OrigBB->getTerminator());
@@ -983,6 +1308,22 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
const TargetTransformInfo &TTI,
ScalarEvolution *SE) {
bool CanOverlap = canOverlap(Memcpy, SE);
+  // If the target prefers a VP-based expansion, lower the intrinsic using
+  // VP intrinsics on scalable vectors.
+ if (TTI.preferMemIntrinsicVPExpansion()) {
+ createMemCpyScalableVectorLoop(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ return;
+ }
+
if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
createMemCpyLoopKnownSize(
/* InsertBefore */ Memcpy,
@@ -1020,13 +1361,22 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
bool SrcIsVolatile = Memmove->isVolatile();
bool DstIsVolatile = SrcIsVolatile;
IRBuilder<> CastBuilder(Memmove);
-
unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
if (SrcAS != DstAS) {
if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
// We may not be able to emit a pointer comparison, but we don't have
// to. Expand as memcpy.
+
+      // If the target prefers a VP-based expansion, lower the intrinsic
+      // using VP intrinsics on scalable vectors.
+ if (TTI.preferMemIntrinsicVPExpansion()) {
+ createMemCpyScalableVectorLoop(/*InsertBefore=*/Memmove, SrcAddr,
+ DstAddr, CopyLen, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ return true;
+ }
+
if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
CI, SrcAlign, DstAlign, SrcIsVolatile,
@@ -1054,6 +1404,15 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
}
}
+  // If the target prefers a VP-based expansion, lower the intrinsic using
+  // VP intrinsics on scalable vectors.
+ if (TTI.preferMemIntrinsicVPExpansion()) {
+ createMemMoveScalableVectorLoop(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ return true;
+ }
+
if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
createMemMoveLoopKnownSize(
/*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
@@ -1066,13 +1425,24 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+void llvm::expandMemSetAsLoop(MemSetInst *Memset,
+ const TargetTransformInfo &TTI) {
+ if (TTI.preferMemIntrinsicVPExpansion()) {
+ createMemSetScalableVectorLoop(
+ /* InsertBefore=*/Memset,
+ /* DstAddr=*/Memset->getRawDest(),
+ /* CopyLen=*/Memset->getLength(),
+ /* SetValue=*/Memset->getValue(),
+ /* Alignment=*/Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile());
+ } else {
+ createMemSetLoop(/* InsertBefore=*/Memset,
+ /* DstAddr=*/Memset->getRawDest(),
+ /* CopyLen=*/Memset->getLength(),
+ /* SetValue=*/Memset->getValue(),
+ /* Alignment=*/Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile());
+ }
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
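For reference, the memset expansion mirrors the copy loops: the byte value is splatted into a vector and written with vp.store until the remaining length reaches zero. A rough C equivalent (illustrative only; the IR builds the splat with insertelement/shufflevector rather than a vmv intrinsic, and vp_memset_sketch is not part of the patch):

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

static void vp_memset_sketch(uint8_t *d, uint8_t c, size_t n) {
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e8m8(n);
    vuint8m8_t v = __riscv_vmv_v_x_u8m8(c, vl);  // splat the byte
    __riscv_vse8_v_u8m8(d, v, vl);               // llvm.vp.store
    d += vl;
    n -= vl;
  }
}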
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/expand-vp-mem-ops.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/expand-vp-mem-ops.ll
new file mode 100644
index 0000000000000..a8d11359ef69e
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/expand-vp-mem-ops.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=riscv64-linux-gnu -mattr=+v -passes=pre-isel-intrinsic-lowering \
+; RUN: -force-mem-intrinsic-expansion -S < %s | FileCheck %s
+
+define void @memcpy1024_i64(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @memcpy1024_i64(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[MEMCPY_VEC_LOOP:.*]]
+; CHECK: [[MEMCPY_VEC_LOOP]]:
+; CHECK-NEXT: [[SRC_F:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[DST_F:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[LEN_F:%.*]] = phi i64 [ 1024, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[LEN_F]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[SRC_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr [[DST_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = getelementptr i8, ptr [[SRC_F]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3]] = getelementptr i8, ptr [[DST_F]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP5]] = sub i64 [[LEN_F]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MEMCPY_VEC_LOOP]], label %[[MEMCPY_VEC_EXIT:.*]]
+; CHECK: [[MEMCPY_VEC_EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 1024, i1 false)
+ ret void
+}
+
+define void @memcpy1024_i32(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @memcpy1024_i32(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[MEMCPY_VEC_LOOP:.*]]
+; CHECK: [[MEMCPY_VEC_LOOP]]:
+; CHECK-NEXT: [[SRC_F:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[DST_F:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[LEN_F:%.*]] = phi i32 [ 1024, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[MEMCPY_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[LEN_F]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[SRC_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr [[DST_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = getelementptr i8, ptr [[SRC_F]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3]] = getelementptr i8, ptr [[DST_F]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP4]] = sub i32 [[LEN_F]], [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MEMCPY_VEC_LOOP]], label %[[MEMCPY_VEC_EXIT:.*]]
+; CHECK: [[MEMCPY_VEC_EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 1024, i1 false)
+ ret void
+}
+
+define void @memset1024(ptr %dst, i8 %value) {
+; CHECK-LABEL: define void @memset1024(
+; CHECK-SAME: ptr [[DST:%.*]], i8 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[MEMSET_VEC_LOOP:.*]]
+; CHECK: [[MEMSET_VEC_LOOP]]:
+; CHECK-NEXT: [[SRC_F:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[MEMSET_VEC_LOOP]] ]
+; CHECK-NEXT: [[LEN_F:%.*]] = phi i64 [ 1024, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[MEMSET_VEC_LOOP]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <vscale x 8 x i8> undef, i8 [[VALUE]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 8 x i8> [[TMP6]], <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[LEN_F]], i32 8, i1 true)
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr [[SRC_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = getelementptr i8, ptr [[SRC_F]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP4]] = sub i64 [[LEN_F]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MEMSET_VEC_LOOP]], label %[[MEMSET_VEC_EXIT:.*]]
+; CHECK: [[MEMSET_VEC_EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.memset.p0.i64(ptr %dst, i8 %value, i64 1024, i1 false)
+ ret void
+}
+
+define void @memmove1024_i64(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @memmove1024_i64(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[COMPARE_ADDR:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; CHECK-NEXT: br i1 [[COMPARE_ADDR]], label %[[VEC_BACKWARD:.*]], label %[[VEC_FORWARD:.*]]
+; CHECK: [[VEC_BACKWARD]]:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC]], i64 1024
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1024
+; CHECK-NEXT: br label %[[VEC_BACKWARD_LOOP:.*]]
+; CHECK: [[VEC_BACKWARD_LOOP]]:
+; CHECK-NEXT: [[SRC_B:%.*]] = phi ptr [ [[TMP0]], %[[VEC_BACKWARD]] ], [ [[TMP4:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[DST_B:%.*]] = phi ptr [ [[TMP1]], %[[VEC_BACKWARD]] ], [ [[TMP6:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[LEN_B:%.*]] = phi i64 [ 1024, %[[VEC_BACKWARD]] ], [ [[TMP9:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[LEN_B]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[SRC_B]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT: [[TMP6]] = getelementptr i8, ptr [[DST_B]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[TMP4]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP7]], ptr [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP9]] = sub i64 [[LEN_B]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP9]], 0
+; CHECK-NEXT: br i1 [[TMP10]], label %[[VEC_BACKWARD_LOOP]], label %[[VEC_DONE:.*]]
+; CHECK: [[VEC_FORWARD]]:
+; CHECK-NEXT: [[SRC_F:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[TMP13:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[DST_F:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[TMP14:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[LEN_F:%.*]] = phi i64 [ 1024, %[[ENTRY]] ], [ [[TMP16:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[LEN_F]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[SRC_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP12]], ptr [[DST_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP13]] = getelementptr i8, ptr [[SRC_F]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14]] = getelementptr i8, ptr [[DST_F]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP16]] = sub i64 [[LEN_F]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[TMP16]], 0
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VEC_FORWARD]], label %[[VEC_DONE]]
+; CHECK: [[VEC_DONE]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 1024, i1 false)
+ ret void
+}
+
+define void @memmove1024_i32(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @memmove1024_i32(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[COMPARE_ADDR:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; CHECK-NEXT: br i1 [[COMPARE_ADDR]], label %[[VEC_BACKWARD:.*]], label %[[VEC_FORWARD:.*]]
+; CHECK: [[VEC_BACKWARD]]:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC]], i32 1024
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i32 1024
+; CHECK-NEXT: br label %[[VEC_BACKWARD_LOOP:.*]]
+; CHECK: [[VEC_BACKWARD_LOOP]]:
+; CHECK-NEXT: [[SRC_B:%.*]] = phi ptr [ [[TMP0]], %[[VEC_BACKWARD]] ], [ [[TMP4:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[DST_B:%.*]] = phi ptr [ [[TMP1]], %[[VEC_BACKWARD]] ], [ [[TMP6:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[LEN_B:%.*]] = phi i32 [ 1024, %[[VEC_BACKWARD]] ], [ [[TMP8:%.*]], %[[VEC_BACKWARD_LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[LEN_B]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[SRC_B]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT: [[TMP6]] = getelementptr i8, ptr [[DST_B]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[TMP4]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP7]], ptr [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP8]] = sub i32 [[LEN_B]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[VEC_BACKWARD_LOOP]], label %[[VEC_DONE:.*]]
+; CHECK: [[VEC_FORWARD]]:
+; CHECK-NEXT: [[SRC_F:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[DST_F:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[TMP13:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[LEN_F:%.*]] = phi i32 [ 1024, %[[ENTRY]] ], [ [[TMP14:%.*]], %[[VEC_FORWARD]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[LEN_F]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr [[SRC_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr [[DST_F]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP12]] = getelementptr i8, ptr [[SRC_F]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP13]] = getelementptr i8, ptr [[DST_F]], i32 [[TMP10]]
+; CHECK-NEXT: [[TMP14]] = sub i32 [[LEN_F]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ugt i32 [[TMP14]], 0
+; CHECK-NEXT: br i1 [[TMP15]], label %[[VEC_FORWARD]], label %[[VEC_DONE]]
+; CHECK: [[VEC_DONE]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.memmove.p0.p0.i32(ptr %dst, ptr %src, i32 1024, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
+declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1)
+declare void @llvm.memmove.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.memmove.p0.p0.i32(ptr, ptr, i32, i1)
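To inspect the expanded IR outside of this test, the same invocation as the RUN line above can be used on any module containing these intrinsics, e.g. (input.ll is a placeholder):

  opt -mtriple=riscv64-linux-gnu -mattr=+v -passes=pre-isel-intrinsic-lowering \
      -force-mem-intrinsic-expansion -S input.ll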