[llvm] [LoopIdiom] Initial support for generating memset_pattern intrinsic (disabled by default) (PR #98311)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 10 05:41:06 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Alex Bradbury (asb)

<details>
<summary>Changes</summary>

Stacks on top of #<!-- -->97583.

When not disabled, produce the memset_pattern intrinsic that was
introduced in https://github.com/llvm/llvm-project/pull/97583.

---

Patch is 61.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98311.diff


17 Files Affected:

- (modified) llvm/docs/LangRef.rst (+56) 
- (modified) llvm/include/llvm/IR/InstVisitor.h (+3) 
- (modified) llvm/include/llvm/IR/IntrinsicInst.h (+21-1) 
- (modified) llvm/include/llvm/IR/Intrinsics.td (+8) 
- (modified) llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp (+8) 
- (modified) llvm/lib/IR/Verifier.cpp (+2-1) 
- (modified) llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp (+89-21) 
- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+113) 
- (added) llvm/test/CodeGen/RISCV/memset-pattern.ll (+591) 
- (added) llvm/test/Transforms/LoopIdiom/memset-pattern-intrinsic.ll (+141) 
- (added) llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg (+2) 
- (added) llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll (+8) 
- (added) llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg (+2) 
- (added) llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll (+8) 
- (added) llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll (+234) 
- (modified) llvm/test/Verifier/intrinsic-immarg.ll (+10) 
- (added) llvm/test/Verifier/memset-pattern-inline.ll (+9) 


``````````diff
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ae39217dc8ff8..44f1d6d6d19bd 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15230,6 +15230,62 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
 '``llvm.memset.*``', but the generated code is guaranteed not to call any
 external functions.
 
+.. _int_memset_pattern:
+
+'``llvm.memset_pattern``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset_pattern`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memset_pattern.p0.i64.i128(ptr <dest>, i128 <val>,
+                                                    i64 <len>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill a block of memory with
+a particular value. This may be expanded to an inline loop, a sequence of
+stores, or a libcall depending on what is available for the target and the
+expected performance and code size impact.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the value with which to fill it, the third argument is an integer
+argument specifying the number of bytes to fill, and the fourth is a boolean
+indicating a volatile access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the
+``llvm.memset_pattern`` call is a :ref:`volatile operation <volatile>`. The
+detailed access behavior is not very cleanly specified and it is unwise to
+depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill "len" bytes of memory
+starting at the destination location. If the argument is known to be aligned
+to some boundary, this can be specified as an attribute on the argument.
+
+If ``<len>`` is not an integer multiple of the pattern width in bytes, then any
+remainder bytes will be copied from ``<val>``.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
 .. _int_sqrt:
 
 '``llvm.sqrt.*``' Intrinsic
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 311e0ac47ddfa..aa4f0f36e4ed7 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -208,6 +208,7 @@ class InstVisitor {
   RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); }
+  RetTy visitMemSetPatternInst(MemSetPatternInst &I) { DELEGATE(MemSetInst); }
   RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
   RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); }
   RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
@@ -295,6 +296,8 @@ class InstVisitor {
       case Intrinsic::memset:      DELEGATE(MemSetInst);
       case Intrinsic::memset_inline:
         DELEGATE(MemSetInlineInst);
+      case Intrinsic::memset_pattern:
+        DELEGATE(MemSetPatternInst);
       case Intrinsic::vastart:     DELEGATE(VAStartInst);
       case Intrinsic::vaend:       DELEGATE(VAEndInst);
       case Intrinsic::vacopy:      DELEGATE(VACopyInst);
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index a2ecf625ff61a..af8789d4958d8 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1208,6 +1208,7 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -1219,7 +1220,8 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
   }
 };
 
-/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
+/// This class wraps the llvm.memset, llvm.memset.inline, and
+/// llvm.memset_pattern intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -1227,6 +1229,7 @@ class MemSetInst : public MemSetBase<MemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
       return true;
     default:
       return false;
@@ -1249,6 +1252,21 @@ class MemSetInlineInst : public MemSetInst {
   }
 };
 
+/// This class wraps the llvm.memset.pattern intrinsic.
+class MemSetPatternInst : public MemSetInst {
+public:
+  ConstantInt *getLength() const {
+    return cast<ConstantInt>(MemSetInst::getLength());
+  }
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 /// This class wraps the llvm.memcpy/memmove intrinsics.
 class MemTransferInst : public MemTransferBase<MemIntrinsic> {
 public:
@@ -1328,6 +1346,7 @@ class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_element_unordered_atomic:
     case Intrinsic::memmove_element_unordered_atomic:
     case Intrinsic::memset_element_unordered_atomic:
@@ -1350,6 +1369,7 @@ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 65a9b68b5229d..f79a19e0a8d29 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1003,6 +1003,14 @@ def int_memset_inline
        NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
        ImmArg<ArgIndex<3>>]>;
 
+// Memset variant that writes a given pattern.
+def int_memset_pattern
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>], "llvm.memset_pattern">;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
 
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8572cdc160456..97267afeacef3 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -276,6 +276,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       Memset->eraseFromParent();
       break;
     }
+    case Intrinsic::memset_pattern: {
+      auto *Memset = cast<MemSetPatternInst>(Inst);
+      expandMemSetAsLoop(Memset);
+      Changed = true;
+      Memset->eraseFromParent();
+      break;
+    }
     default:
       llvm_unreachable("unhandled intrinsic");
     }
@@ -294,6 +301,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
       Changed |= expandMemIntrinsicUses(F);
       break;
     case Intrinsic::load_relative:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index d156eaec4c172..75f4bf2973919 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5435,7 +5435,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-  case Intrinsic::memset_inline: {
+  case Intrinsic::memset_inline:
+  case Intrinsic::memset_pattern: {
     break;
   }
   case Intrinsic::memcpy_element_unordered_atomic:
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 635bd1236196e..27fe98ec363ca 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -133,6 +133,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
              "with -Os/-Oz"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableMemsetPatternIntrinsic(
+    "loop-idiom-enable-memset-pattern-intrinsic",
+    cl::desc("Enable use of the memset_pattern intrinsic."), cl::init(false),
+    cl::Hidden);
+
 namespace {
 
 class LoopIdiomRecognize {
@@ -300,7 +305,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
   HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
   HasMemcpy = TLI->has(LibFunc_memcpy);
 
-  if (HasMemset || HasMemsetPattern || HasMemcpy)
+  if (HasMemset || HasMemsetPattern || EnableMemsetPatternIntrinsic ||
+      HasMemcpy)
     if (SE->hasLoopInvariantBackedgeTakenCount(L))
       return runOnCountableLoop();
 
@@ -457,7 +463,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
     // It looks like we can use SplatValue.
     return LegalStoreKind::Memset;
   }
-  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
+  if (!UnorderedAtomic && (HasMemsetPattern || EnableMemsetPatternIntrinsic) &&
+      !DisableLIRP::Memset &&
       // Don't create memset_pattern16s with address spaces.
       StorePtr->getType()->getPointerAddressSpace() == 0 &&
       getMemSetPatternValue(StoredVal, DL)) {
@@ -993,6 +1000,46 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                         SCEV::FlagNUW);
 }
 
+ConstantInt *memSetPatternValueToI128ConstantInt(LLVMContext &Context,
+                                                 Value *MemSetPatternValue) {
+  if (auto CIMemSetPatternValue = dyn_cast<ConstantInt>(MemSetPatternValue)) {
+    return CIMemSetPatternValue;
+  }
+
+  if (auto Array = dyn_cast<ConstantDataArray>(MemSetPatternValue)) {
+    Type *ElementType = Array->getElementType();
+    unsigned ElementSize = Array->getElementByteSize() * 8;
+
+    APInt Result(128, 0);
+    unsigned totalBits = 0;
+
+    for (unsigned i = 0; i < Array->getNumElements(); ++i) {
+      if (totalBits + ElementSize > 128) {
+        report_fatal_error("Pattern value unexpectedly greater than 128 bits");
+      }
+
+      APInt ElementBits;
+      if (ElementType->isIntegerTy()) {
+        ElementBits = Array->getElementAsAPInt(i);
+      } else if (ElementType->isFloatingPointTy()) {
+        APFloat APF = Array->getElementAsAPFloat(i);
+        ElementBits = APF.bitcastToAPInt();
+      } else {
+        llvm_unreachable("Unexpected element type");
+      }
+
+      // Shift the existing result left by the element's size and OR in the new
+      // value
+      Result = (Result << ElementSize) | ElementBits.zextOrTrunc(128);
+      totalBits += ElementSize;
+    }
+
+    // Create and return a ConstantInt with the resulting value
+    return ConstantInt::get(Context, Result);
+  }
+  report_fatal_error("Encountered unrecognised type");
+}
+
 /// processLoopStridedStore - We see a strided store of some value.  If we can
 /// transform this into a memset or memset_pattern in the loop preheader, do so.
 bool LoopIdiomRecognize::processLoopStridedStore(
@@ -1070,7 +1117,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   Value *NumBytes =
       Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
 
-  if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
+  if (!SplatValue && !(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
+                       EnableMemsetPatternIntrinsic))
     return Changed;
 
   AAMDNodes AATags = TheStore->getAAMetadata();
@@ -1087,24 +1135,44 @@ bool LoopIdiomRecognize::processLoopStridedStore(
         BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
         /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
   } else {
-    assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
-    // Everything is emitted in default address space
-    Type *Int8PtrTy = DestInt8PtrTy;
-
-    StringRef FuncName = "memset_pattern16";
-    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
-                            Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
-    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
-
-    // Otherwise we should form a memset_pattern16.  PatternValue is known to be
-    // an constant array of 16-bytes.  Plop the value into a mergable global.
-    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
-                                            GlobalValue::PrivateLinkage,
-                                            PatternValue, ".memset_pattern");
-    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
-    GV->setAlignment(Align(16));
-    Value *PatternPtr = GV;
-    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
+           EnableMemsetPatternIntrinsic);
+    if (EnableMemsetPatternIntrinsic) {
+      // Everything is emitted in default address space
+
+      // Get or insert the intrinsic declaration
+      Function *MemsetPatternIntrinsic = Intrinsic::getDeclaration(
+          M, Intrinsic::memset_pattern,
+          {DestInt8PtrTy, Builder.getInt128Ty(), Builder.getInt64Ty()});
+
+      // Create the call to the intrinsic
+      NewCall = Builder.CreateCall(
+          MemsetPatternIntrinsic,
+          {BasePtr,
+           memSetPatternValueToI128ConstantInt(M->getContext(), PatternValue),
+           NumBytes, ConstantInt::getFalse(M->getContext())});
+    } else {
+      // Everything is emitted in default address space
+      Type *Int8PtrTy = DestInt8PtrTy;
+
+      StringRef FuncName = "memset_pattern16";
+      FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
+                                              Builder.getVoidTy(), Int8PtrTy,
+                                              Int8PtrTy, IntIdxTy);
+      inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
+
+      // Otherwise we should form a memset_pattern16.  PatternValue is known to
+      // be a constant array of 16 bytes.  Plop the value into a mergable
+      // global.
+      GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+                                              GlobalValue::PrivateLinkage,
+                                              PatternValue, ".memset_pattern");
+      GV->setUnnamedAddr(
+          GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+      GV->setAlignment(Align(16));
+      Value *PatternPtr = GV;
+      NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+    }
 
     // Set the TBAA info if present.
     if (AATags.TBAA)
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d2814f07530d8..8d7c234d16878 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -456,6 +456,109 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   ElseTerm->eraseFromParent();
 }
 
+static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
+                                    Value *CopyLen, Value *SetValue,
+                                    Align DstAlign, bool IsVolatile) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getDataLayout();
+
+  if (DL.isBigEndian())
+    report_fatal_error("memset_pattern.inline expansion not currently "
+                       "implemented for big-endian targets",
+                       false);
+
+  // To start with, let's assume SetValue is an i128 and bail out if it's not.
+  if (!isPowerOf2_32(SetValue->getType()->getScalarSizeInBits()))
+    report_fatal_error("Pattern width for memset_pattern must be a power of 2",
+                       false);
+  unsigned PatternSize = SetValue->getType()->getScalarSizeInBits() / 8;
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+
+  BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
+  BasicBlock *LoopBB =
+      BasicBlock::Create(F->getContext(), "storeloop", F, NewBB);
+  BasicBlock *RemCheckBB =
+      BasicBlock::Create(F->getContext(), "remcheck", F, NewBB);
+  BasicBlock *RemainderLoopBB =
+      BasicBlock::Create(F->getContext(), "remainderloop", F, NewBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  ConstantInt *CILoopOpSize =
+      ConstantInt::get(dyn_cast<IntegerType>(TypeOfCopyLen), PatternSize);
+  Value *RuntimeLoopCount =
+      getRuntimeLoopCount(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
+  Value *RuntimeRemainder =
+      getRuntimeLoopRemainder(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
+
+  Builder.CreateCondBr(Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0),
+                                            RuntimeLoopCount),
+                       RemCheckBB, LoopBB);
+  OrigBB->getTerminator()->eraseFromParent();
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *CurrentDst = LoopBuilder.CreatePHI(DstAddr->getType(), 0);
+  CurrentDst->addIncoming(DstAddr, OrigBB);
+  PHINode *LoopCount = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopCount->addIncoming(RuntimeLoopCount, OrigBB);
+
+  // Create the store instruction for the pattern
+  LoopBuilder.CreateAlignedStore(SetValue, CurrentDst, DstAlign, IsVolatile);
+
+  Value *NextDst = LoopBuilder.CreateInBoundsGEP(
+      SetValue->getType(), CurrentDst,
+      ConstantInt::get(TypeOfCopyLen, PatternSize));
+  CurrentDst->addIncoming(NextDst, LoopBB);
+
+  Value *NewLoopCount =
+      LoopBuilder.CreateSub(LoopCount, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopCount->addIncoming(NewLoopCount, LoopBB);
+
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpNE(NewLoopCount,
+                               ConstantInt::get(TypeOfCopyLen, 0)),
+      LoopBB, RemCheckBB);
+
+  IRBuilder<> RemCheckBuilder(RemCheckBB, RemCheckBB->begin());
+  // Branch to the end if there are no remainder bytes.
+  PHINode *RemainderDstPHI = RemCheckBuilder.CreatePHI(NextDst->getType(), 0);
+  RemainderDstPHI->addIncoming(DstAddr, OrigBB);
+  RemainderDstPHI->addIncoming(NextDst, LoopBB);
+  RemCheckBuilder.CreateCondBr(
+      RemCheckBuilder.CreateICmpEQ(RuntimeRemainder,
+                                   ConstantInt::get(TypeOfCopyLen, 0)),
+      NewBB, RemainderLoopBB);
+
+  // Remainder loop
+  IRBuilder<> RemainderLoopBuilder(RemainderLoopBB);
+  PHINode *ByteIndex = RemainderLoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  ByteIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), RemCheckBB);
+  Type *TypeOfSetValue = SetValue->getType();
+  PHINode *ShiftedValue = RemainderLoopBuilder.CreatePHI(TypeOfSetValue, 0);
+  ShiftedValue->addIncoming(SetValue, RemCheckBB);
+
+  Value *ByteToStore = RemainderLoopBuilder.CreateTrunc(
+      ShiftedValue, RemainderLoopBuilder.getInt8Ty());
+
+  RemainderLoopBuilder.CreateStore(
+      ByteToStore,
+      RemainderLoopBuilder.CreateInBoundsGEP(RemainderLoopBuilder.getInt8Ty(),
+                                             RemainderDstPHI, ByteIndex),
+      IsVolatile);
+
+  Value *NewByteIndex = RemainderLoopBuilder.CreateAdd(
+      ByteIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  ByteIndex->addIncoming(NewByteIndex, RemainderLoopBB);
+  Value *NewShiftedValue = RemainderLoopBuilder.CreateLShr(
+      ShiftedValue, ConstantInt::get(TypeOfSetValue, 8));
+  ShiftedValue->addIncoming(NewShiftedValue, RemainderLoopBB);
+
+  RemainderLoopBuilder.CreateCondBr(
+      RemainderLoopBuilder.CreateICmpULT(NewByte...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/98311


More information about the llvm-commits mailing list