[llvm] [CodeGen][PreISelIntrinsicLowering] Add VP-based lowering for memcpy/memmove/memset (PR #165585)
David Del Río via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 4 03:58:08 PST 2025
================
@@ -373,6 +465,173 @@ tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
return {ResAddr1, ResAddr2};
}
+// Lower memmove to IR using VP intrinsics. memmove is required to correctly
+// copy overlapping memory regions; therefore, it has to check the relative
+// positions of the source and destination pointers and choose the copy
+// direction accordingly.
+//
+// The code below is an IR rendition of this C function using RISC-V vector
+// intrinsics:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// uint8_t *d = (uint8_t *)dst;
+// const uint8_t *s = (const uint8_t *)src;
+// if (d < s || d >= s + n) {
+// // Forward copy
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// s += vl;
+// d += vl;
+// n -= vl;
+// }
+// } else {
+// // Backward copy
+// s += n;
+// d += n;
+// while (n > 0) {
+// size_t vl = __riscv_vsetvl_e8m8(n);
+// s -= vl;
+// d -= vl;
+// vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
+// __riscv_vse8_v_u8m8(d, v, vl);
+// n -= vl;
+// }
+// }
+// return dst;
+// }
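+//
+// For reference, the backward-copy loop built below produces IR of roughly
+// the following shape (a sketch for illustration only; value and block names
+// are invented, not the ones the builder emits):
+//
+//   vec.backward.loop:
+//     %len = phi i64 [ %n, %vec.backward ], [ %len.next, %vec.backward.loop ]
+//     %vl = call i32 @llvm.experimental.get.vector.length.i64(i64 %len, i32 8, i1 true)
+//     ...
+//     %v = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %src.next, <vscale x 8 x i1> splat (i1 true), i32 %vl)
+//     call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %v, ptr %dst.next, <vscale x 8 x i1> splat (i1 true), i32 %vl)
+//     %vl.zext = zext i32 %vl to i64
+//     %len.next = sub i64 %len, %vl.zext
+//     ...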
+static void createMemMoveScalableVectorLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+
+ BasicBlock *EntryBB = InsertBefore->getParent();
+ Function *F = EntryBB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ Module *M = F->getParent();
+
+ Type *Int8Ty = Type::getInt8Ty(Ctx);
+ ElementCount EC = ElementCount::getScalable(8);
+ Type *VecTy = VectorType::get(Int8Ty, EC); // <vscale x 8 x i8>
+ Type *PtrTy = PointerType::get(VecTy, 0); // opaque ptr in addrspace(0)
+
+ Type *Int1Ty = Type::getInt1Ty(Ctx);
+ Type *MaskTy = VectorType::get(Int1Ty, EC);
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int64Ty = Type::getInt64Ty(Ctx);
+
+ Type *LenType = CopyLen->getType();
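+ // All-ones mask: with VP intrinsics the set of active lanes is governed by
+ // the explicit vector length (EVL) operand, so every lane is enabled here.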
+ Value *TrueMaskVec = Constant::getAllOnesValue(MaskTy);
+
+ IRBuilder<> Builder(InsertBefore);
+ // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the
+ // backward-copy part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ // If the pointers are in different address spaces, they need to be converted
+ // to a compatible one. Cases where memory ranges in the different address
+ // spaces cannot overlap are lowered as memcpy and not handled here.
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(Builder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ Builder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_addr");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ // We now have two parts, a backward and a forward copy, to ensure
+ // correctness of the result on overlapping memory regions.
+ BasicBlock *BackwardBB = ThenTerm->getParent();
+ BackwardBB->setName("vec.backward");
+ BasicBlock *ForwardBB = ElseTerm->getParent();
+ ForwardBB->setName("vec.forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("vec.done");
+
+ // === Backward Copy Loop ===
+ // This part is divided in two Blocks:
+ // - The preheader, where pointers are moved to the end of the region to copy
+ // - The loop block that will perform the data copy
+ {
+ // Init block for initializing Src and Dest addresses
+ IRBuilder<> BackwardBuilder(BackwardBB);
+ Value *SrcStart = BackwardBuilder.CreateGEP(Int8Ty, SrcAddr, CopyLen);
+ Value *DstStart = BackwardBuilder.CreateGEP(Int8Ty, DstAddr, CopyLen);
+
+ BasicBlock *BackwardLoopBB =
+ BasicBlock::Create(F->getContext(), "vec.backward.loop", F, ForwardBB);
+ BackwardBuilder.CreateBr(BackwardLoopBB);
+
+ // Backward copy loop
+ IRBuilder<> BackwardLoopBuilder(BackwardLoopBB);
+ PHINode *Src =
+ BackwardLoopBuilder.CreatePHI(SrcStart->getType(), 2, "src.b");
+ PHINode *Dst =
+ BackwardLoopBuilder.CreatePHI(DstStart->getType(), 2, "dst.b");
+ PHINode *Len = BackwardLoopBuilder.CreatePHI(LenType, 2, "len.b");
+
+ Src->addIncoming(SrcStart, BackwardBB);
+ Dst->addIncoming(DstStart, BackwardBB);
+ Len->addIncoming(CopyLen, BackwardBB);
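+ // The second incoming value of each phi (the backedge from this block) is
+ // filled in once the loop body below has computed SrcNext, DstNext and
+ // LenNext.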
+
+ // Compute next chunk to be processed
+ Value *VL = Len;
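+ // llvm.experimental.get.vector.length returns how many lanes (at most VL)
+ // the target will process this iteration; the VF argument (8) must match
+ // VecTy's element count. On RISC-V this typically lowers to vsetvli.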
+ Value *ActualVL = BackwardLoopBuilder.CreateIntrinsic(
+ Intrinsic::experimental_get_vector_length, {LenType},
+ {VL, ConstantInt::get(Int32Ty, 8), BackwardLoopBuilder.getTrue()});
+
+ Value *SrcNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Src, BackwardLoopBuilder.CreateNeg(ActualVL));
+ Value *DstNext = BackwardLoopBuilder.CreateGEP(
+ Int8Ty, Dst, BackwardLoopBuilder.CreateNeg(ActualVL));
+
+ // Load and store a chunk
+ Value *SrcBitcast = BackwardLoopBuilder.CreateBitCast(SrcNext, PtrTy);
+ Value *DstBitcast = BackwardLoopBuilder.CreateBitCast(DstNext, PtrTy);
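+ // With opaque pointers these bitcasts are no-ops and fold back to the
+ // original pointer values.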
+
+ FunctionCallee VPLoad = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_load, {VecTy, PtrTy});
+ Value *Vec = BackwardLoopBuilder.CreateCall(
+ VPLoad, {SrcBitcast, TrueMaskVec, ActualVL});
+
+ FunctionCallee VPStore = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vp_store, {VecTy, PtrTy});
+ BackwardLoopBuilder.CreateCall(VPStore,
+ {Vec, DstBitcast, TrueMaskVec, ActualVL});
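+ // Lanes at or beyond ActualVL are inactive: the vp.load yields poison in
+ // those lanes and the vp.store leaves the corresponding memory untouched.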
+
+ // Update loop state
+ // On 64-bit targets the two operands used to update Len have different
+ // types (ActualVL is Int32Ty, Len is Int64Ty), so ActualVL must be
+ // zero-extended before the subtraction.
+ Value *LenNext;
+ if (LenType == Int64Ty)
+ LenNext = BackwardLoopBuilder.CreateSub(
+ Len, BackwardLoopBuilder.CreateZExt(ActualVL, Int64Ty));
----------------
dadra-oc wrote:
Thanks, already changed it.
https://github.com/llvm/llvm-project/pull/165585