[llvm] [CodeGen][PreISelIntrinsicLowering] Add VP-based lowering for memcpy/memmove/memset (PR #165585)
David Del Río via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 4 03:58:08 PST 2025
================
@@ -373,6 +465,173 @@ tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
return {ResAddr1, ResAddr2};
}
+// Lower memmove to IR using VP intrinsics. memmove is required to correctly
+// copy overlapping memory regions; therefore, it has to check the relative
+// positions of the source and destination pointers and choose the copy
+// direction accordingly.
+//
+// The code below is an IR rendition of this C function using RISC-V vector
+// intrinsics:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// uint8_t *d = (uint8_t *)dst;
+// const uint8_t *s = (const uint8_t *)src;
+// if (d < s || d >= s + n) {
+// // Forward copy
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// s += vl;
+// d += vl;
+// n -= vl;
+// }
+// } else {
+// // Backward copy
+// s += n;
+// d += n;
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// s -= vl;
+// d -= vl;
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// n -= vl;
+// }
+// }
+// return dst;
+// }
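+//
+// For reference, the backward-copy loop built below produces IR of roughly
+// the following shape (a sketch for illustration only; value and block names
+// are invented, not the ones the builder emits):
+//
+//   vec.backward.loop:
+//     %len = phi i64 [ %n, %vec.backward ], [ %len.next, %vec.backward.loop ]
+//     %vl = call i32 @llvm.experimental.get.vector.length.i64(i64 %len, i32 8, i1 true)
+//     ...
+//     %v = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %src.next, <vscale x 8 x i1> splat (i1 true), i32 %vl)
+//     call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %v, ptr %dst.next, <vscale x 8 x i1> splat (i1 true), i32 %vl)
+//     %vl.zext = zext i32 %vl to i64
+//     %len.next = sub i64 %len, %vl.zext
+//     ...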
+static void createMemMoveScalableVectorLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+
+ BasicBlock *EntryBB = InsertBefore->getParent();
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ Module *M = F->getParent();
+
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ ElementCount EC = ElementCount::getScalable(8);
+ Type *VecTy = VectorType::get(Int8Ty, EC); // <vscale x 8 x i8>
+ Type *PtrTy = PointerType::get(VecTy, 0); // opaque ptr in addrspace(0)
+
+ Type *Int1Ty = Type::getInt1Ty(Ctx);
+ Type *MaskTy = VectorType::get(Int1Ty, EC);
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int64Ty = Type::getInt64Ty(Ctx);
+
+ Type *LenType = CopyLen->getType();
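+ // All-ones mask: with VP intrinsics the set of active lanes is governed by
+ // the explicit vector length (EVL) operand, so every lane is enabled here.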
+ Value *TrueMaskVec = Constant::getAllOnesValue(MaskTy);
+
+ IRBuilder<> Builder(InsertBefore);
+ // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the
+ // backward-copy part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ // If the pointers are in different address spaces, they need to be converted
+ // to a compatible one. Cases where memory ranges in the different address
+ // spaces cannot overlap are lowered as memcpy and not handled here.
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(Builder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ Builder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_addr");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ // We now have two parts, a backward and a forward copy, to ensure
+ // correctness of the result on overlapping memory regions.
+ BasicBlock *BackwardBB = ThenTerm->getParent();
+ BackwardBB->setName("vec.backward");
+ BasicBlock *ForwardBB = ElseTerm->getParent();
+ ForwardBB->setName("vec.forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("vec.done");
+
+ // === Backward Copy Loop ===
+ // This part is divided in two Blocks:
+ // - The preheader, where pointers are moved to the end of the region to copy
+ // - The loop block that will perform the data copy
+ {
+ // Init block for initializing Src and Dest addresses
+ IRBuilder<> BackwardBuilder(BackwardBB);
+ Value *SrcStart = BackwardBuilder.CreateGEP(Int8Ty, SrcAddr, CopyLen);
+ Value *DstStart = BackwardBuilder.CreateGEP(Int8Ty, DstAddr, CopyLen);
+
+ BasicBlock *BackwardLoopBB =
+ BasicBlock::Create(F->getContext(), "vec.backward.loop", F, ForwardBB);
+ BackwardBuilder.CreateBr(BackwardLoopBB);
+
+ // Backward copy loop
+ IRBuilder<> BackwardLoopBuilder(BackwardLoopBB);
+ PHINode *Src =
+ BackwardLoopBuilder.CreatePHI(SrcStart->getType(), 2, "src.b");
+ PHINode *Dst =
+ BackwardLoopBuilder.CreatePHI(DstStart->getType(), 2, "dst.b");
+ PHINode *Len = BackwardLoopBuilder.CreatePHI(LenType, 2, "len.b");
+
+ Src->addIncoming(SrcStart, BackwardBB);
+ Dst->addIncoming(DstStart, BackwardBB);
+ Len->addIncoming(CopyLen, BackwardBB);
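+ // The second incoming value of each phi (the backedge from this block) is
+ // filled in once the loop body below has computed SrcNext, DstNext and
+ // LenNext.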
+
+ // Compute next chunk to be processed
+ Value *VL = Len;
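+ // llvm.experimental.get.vector.length returns how many lanes (at most VL)
+ // the target will process this iteration; the VF argument (8) must match
+ // VecTy's element count. On RISC-V this typically lowers to vsetvli.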
+ Value *ActualVL = BackwardLoopBuilder.CreateIntrinsic(
+ Intrinsic::experimental_get_vector_length, {LenType},
+ {VL, ConstantInt::get(Int32Ty, 8), BackwardLoopBuilder.getTrue()});
+
+ Value *SrcNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Src, BackwardLoopBuilder.CreateNeg(ActualVL));
+ Value *DstNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Dst, BackwardLoopBuilder.CreateNeg(ActualVL));
+
+ // Load and store a chunk
+ Value *SrcBitcast = BackwardLoopBuilder.CreateBitCast(SrcNext, PtrTy);
+ Value *DstBitcast = BackwardLoopBuilder.CreateBitCast(DstNext, PtrTy);
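+ // With opaque pointers these bitcasts are no-ops and fold back to the
+ // original pointer values.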
+
+ FunctionCallee VPLoad = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_load, {VecTy, PtrTy});
+ Value *Vec = BackwardLoopBuilder.CreateCall(
+ VPLoad, {SrcBitcast, TrueMaskVec, ActualVL});
+
+ FunctionCallee VPStore = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_store, {VecTy, PtrTy});
+ BackwardLoopBuilder.CreateCall(VPStore,
+ {Vec, DstBitcast, TrueMaskVec, ActualVL});
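+ // Lanes at or beyond ActualVL are inactive: the vp.load yields poison in
+ // those lanes and the vp.store leaves the corresponding memory untouched.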
+
+ // Update loop state
+ // On 64-bit targets the two operands used to update Len have different
+ // types (ActualVL is Int32Ty, Len is Int64Ty), so ActualVL must be
+ // zero-extended before the subtraction.
+ Value *LenNext;
+ if (LenType == Int64Ty)
+ LenNext = BackwardLoopBuilder.CreateSub(
+ Len, BackwardLoopBuilder.CreateZExt(ActualVL, Int64Ty));
----------------
dadra-oc wrote:
Thanks, already changed it.
https://github.com/llvm/llvm-project/pull/165585