[llvm] [IR] Initial introduction of memset_pattern (PR #97583)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 10 05:38:04 PDT 2024


https://github.com/asb updated https://github.com/llvm/llvm-project/pull/97583

From 627f1ef07008107924c3e5031776d312d497becf Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 10 Jul 2024 10:33:45 +0100
Subject: [PATCH] [IR] Initial introduction of memset_pattern

---
 llvm/docs/LangRef.rst                         |  56 ++
 llvm/include/llvm/IR/InstVisitor.h            |   3 +
 llvm/include/llvm/IR/IntrinsicInst.h          |  22 +-
 llvm/include/llvm/IR/Intrinsics.td            |   8 +
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp |   8 +
 llvm/lib/IR/Verifier.cpp                      |   3 +-
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 113 ++++
 llvm/test/CodeGen/RISCV/memset-pattern.ll     | 591 ++++++++++++++++++
 .../PowerPC/lit.local.cfg                     |   2 +
 .../PowerPC/memset-pattern.ll                 |   8 +
 .../RISCV/lit.local.cfg                       |   2 +
 ...memset-pattern-non-power-of-two-pattern.ll |   8 +
 .../RISCV/memset-pattern.ll                   | 234 +++++++
 llvm/test/Verifier/intrinsic-immarg.ll        |  10 +
 llvm/test/Verifier/memset-pattern.ll          |   9 +
 15 files changed, 1075 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/memset-pattern.ll
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
 create mode 100644 llvm/test/Verifier/memset-pattern.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ae39217dc8ff8..44f1d6d6d19bd 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15230,6 +15230,62 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
 '``llvm.memset.*``', but the generated code is guaranteed not to call any
 external functions.
 
+.. _int_memset_pattern:
+
+'``llvm.memset_pattern``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset_pattern`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths, however.
+
+::
+
+      declare void @llvm.memset_pattern.p0.i64.i128(ptr <dest>, i128 <val>,
+                                                    i64 <len>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill a block of memory with
+a particular repeating pattern. The fill may be expanded to an inline loop,
+a sequence of stores, or a libcall depending on what is available for the
+target and the expected performance and code size impact.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the pattern value with which to fill it, the third is an integer
+specifying the number of bytes to fill, and the fourth is a boolean
+indicating whether the access is volatile.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the
+``llvm.memset_pattern`` call is a :ref:`volatile operation <volatile>`. The
+detailed access behavior is not very cleanly specified and it is unwise to
+depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill "len" bytes of memory
+starting at the destination location. If the argument is known to be aligned
+to some boundary, this can be specified as an attribute on the argument.
+
+If ``<len>`` is not an integer multiple of the pattern width in bytes, the
+trailing bytes are filled with the initial bytes of the pattern.
+If ``<len>`` is 0, the call is a no-op modulo the behavior of attributes
+attached to the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` must be well-defined; otherwise the
+behavior is undefined.
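+
+As an illustrative example, the following call fills ten bytes starting at
+``%dst`` by storing the four-byte pattern ``%pat`` twice, then filling the
+remaining two bytes with the initial bytes of the pattern:
+
+::
+
+      call void @llvm.memset_pattern.p0.i64.i32(ptr %dst, i32 %pat,
+                                                i64 10, i1 false)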
+
 .. _int_sqrt:
 
 '``llvm.sqrt.*``' Intrinsic
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 311e0ac47ddfa..aa4f0f36e4ed7 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -208,6 +208,7 @@ class InstVisitor {
   RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); }
+  RetTy visitMemSetPatternInst(MemSetPatternInst &I) { DELEGATE(MemSetInst); }
   RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
   RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); }
   RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
@@ -295,6 +296,8 @@ class InstVisitor {
       case Intrinsic::memset:      DELEGATE(MemSetInst);
       case Intrinsic::memset_inline:
         DELEGATE(MemSetInlineInst);
+      case Intrinsic::memset_pattern:
+        DELEGATE(MemSetPatternInst);
       case Intrinsic::vastart:     DELEGATE(VAStartInst);
       case Intrinsic::vaend:       DELEGATE(VAEndInst);
       case Intrinsic::vacopy:      DELEGATE(VACopyInst);
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index a2ecf625ff61a..af8789d4958d8 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1208,6 +1208,7 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -1219,7 +1220,8 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
   }
 };
 
-/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
+/// This class wraps the llvm.memset, llvm.memset.inline, and
+/// llvm.memset_pattern intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -1227,6 +1229,7 @@ class MemSetInst : public MemSetBase<MemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
       return true;
     default:
       return false;
@@ -1249,6 +1252,21 @@ class MemSetInlineInst : public MemSetInst {
   }
 };
 
+/// This class wraps the llvm.memset_pattern intrinsic. Unlike
+/// llvm.memset.inline, the length is not required to be a constant.
+class MemSetPatternInst : public MemSetInst {
+public:
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 /// This class wraps the llvm.memcpy/memmove intrinsics.
 class MemTransferInst : public MemTransferBase<MemIntrinsic> {
 public:
@@ -1328,6 +1346,7 @@ class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_element_unordered_atomic:
     case Intrinsic::memmove_element_unordered_atomic:
     case Intrinsic::memset_element_unordered_atomic:
@@ -1350,6 +1369,7 @@ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 65a9b68b5229d..f79a19e0a8d29 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1003,6 +1003,14 @@ def int_memset_inline
        NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
        ImmArg<ArgIndex<3>>]>;
 
+// Memset variant that writes a given pattern.
+def int_memset_pattern
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>], "llvm.memset_pattern">;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
 
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8572cdc160456..97267afeacef3 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -276,6 +276,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       Memset->eraseFromParent();
       break;
     }
+    case Intrinsic::memset_pattern: {
+      auto *Memset = cast<MemSetPatternInst>(Inst);
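+      // For now the expansion is always an inline loop; no libcall fallback
+      // is attempted.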
+      expandMemSetAsLoop(Memset);
+      Changed = true;
+      Memset->eraseFromParent();
+      break;
+    }
     default:
       llvm_unreachable("unhandled intrinsic");
     }
@@ -294,6 +301,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
       Changed |= expandMemIntrinsicUses(F);
       break;
     case Intrinsic::load_relative:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index d156eaec4c172..75f4bf2973919 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5435,7 +5435,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-  case Intrinsic::memset_inline: {
+  case Intrinsic::memset_inline:
+  case Intrinsic::memset_pattern: {
     break;
   }
   case Intrinsic::memcpy_element_unordered_atomic:
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d2814f07530d8..8d7c234d16878 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -456,6 +456,109 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   ElseTerm->eraseFromParent();
 }
 
+static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
+                                    Value *CopyLen, Value *SetValue,
+                                    Align DstAlign, bool IsVolatile) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getDataLayout();
+
+  if (DL.isBigEndian())
+    report_fatal_error("memset_pattern.inline expansion not currently "
+                       "implemented for big-endian targets",
+                       false);
+
+  // Only power-of-two pattern widths are currently supported; bail out for
+  // anything else.
+  if (!isPowerOf2_32(SetValue->getType()->getScalarSizeInBits()))
+    report_fatal_error("Pattern width for memset_pattern must be a power of 2",
+                       false);
+  unsigned PatternSize = SetValue->getType()->getScalarSizeInBits() / 8;
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+
+  BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
+  BasicBlock *LoopBB =
+      BasicBlock::Create(F->getContext(), "storeloop", F, NewBB);
+  BasicBlock *RemCheckBB =
+      BasicBlock::Create(F->getContext(), "remcheck", F, NewBB);
+  BasicBlock *RemainderLoopBB =
+      BasicBlock::Create(F->getContext(), "remainderloop", F, NewBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  ConstantInt *CILoopOpSize =
+      ConstantInt::get(cast<IntegerType>(TypeOfCopyLen), PatternSize);
+  Value *RuntimeLoopCount =
+      getRuntimeLoopCount(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
+  Value *RuntimeRemainder =
+      getRuntimeLoopRemainder(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
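+  // CopyLen == RuntimeLoopCount * PatternSize + RuntimeRemainder.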
+
+  Builder.CreateCondBr(Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0),
+                                            RuntimeLoopCount),
+                       RemCheckBB, LoopBB);
+  OrigBB->getTerminator()->eraseFromParent();
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *CurrentDst = LoopBuilder.CreatePHI(DstAddr->getType(), 0);
+  CurrentDst->addIncoming(DstAddr, OrigBB);
+  PHINode *LoopCount = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopCount->addIncoming(RuntimeLoopCount, OrigBB);
+
+  // Store one full copy of the pattern, then advance the destination pointer
+  // by the pattern size in bytes.
+  LoopBuilder.CreateAlignedStore(SetValue, CurrentDst, DstAlign, IsVolatile);
+
+  Value *NextDst = LoopBuilder.CreateInBoundsGEP(
+      LoopBuilder.getInt8Ty(), CurrentDst,
+      ConstantInt::get(TypeOfCopyLen, PatternSize));
+  CurrentDst->addIncoming(NextDst, LoopBB);
+
+  Value *NewLoopCount =
+      LoopBuilder.CreateSub(LoopCount, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopCount->addIncoming(NewLoopCount, LoopBB);
+
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpNE(NewLoopCount,
+                               ConstantInt::get(TypeOfCopyLen, 0)),
+      LoopBB, RemCheckBB);
+
+  IRBuilder<> RemCheckBuilder(RemCheckBB, RemCheckBB->begin());
+  // Branch to the end if there are no remainder bytes.
+  PHINode *RemainderDstPHI = RemCheckBuilder.CreatePHI(NextDst->getType(), 0);
+  RemainderDstPHI->addIncoming(DstAddr, OrigBB);
+  RemainderDstPHI->addIncoming(NextDst, LoopBB);
+  RemCheckBuilder.CreateCondBr(
+      RemCheckBuilder.CreateICmpEQ(RuntimeRemainder,
+                                   ConstantInt::get(TypeOfCopyLen, 0)),
+      NewBB, RemainderLoopBB);
+
+  // Remainder loop
+  IRBuilder<> RemainderLoopBuilder(RemainderLoopBB);
+  PHINode *ByteIndex = RemainderLoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  ByteIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), RemCheckBB);
+  Type *TypeOfSetValue = SetValue->getType();
+  PHINode *ShiftedValue = RemainderLoopBuilder.CreatePHI(TypeOfSetValue, 0);
+  ShiftedValue->addIncoming(SetValue, RemCheckBB);
+
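+  // Each iteration stores the pattern's current low byte, then shifts the
+  // pattern right; on the little-endian targets supported here this follows
+  // the pattern's in-memory byte order.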
+  Value *ByteToStore = RemainderLoopBuilder.CreateTrunc(
+      ShiftedValue, RemainderLoopBuilder.getInt8Ty());
+
+  RemainderLoopBuilder.CreateStore(
+      ByteToStore,
+      RemainderLoopBuilder.CreateInBoundsGEP(RemainderLoopBuilder.getInt8Ty(),
+                                             RemainderDstPHI, ByteIndex),
+      IsVolatile);
+
+  Value *NewByteIndex = RemainderLoopBuilder.CreateAdd(
+      ByteIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  ByteIndex->addIncoming(NewByteIndex, RemainderLoopBB);
+  Value *NewShiftedValue = RemainderLoopBuilder.CreateLShr(
+      ShiftedValue, ConstantInt::get(TypeOfSetValue, 8));
+  ShiftedValue->addIncoming(NewShiftedValue, RemainderLoopBB);
+
+  RemainderLoopBuilder.CreateCondBr(
+      RemainderLoopBuilder.CreateICmpULT(NewByteIndex, RuntimeRemainder),
+      RemainderLoopBB, NewBB);
+}
+
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                              Value *CopyLen, Value *SetValue, Align DstAlign,
                              bool IsVolatile) {
@@ -591,6 +694,16 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
 }
 
 void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
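+  // Pattern memsets need a dedicated expansion that stores whole patterns
+  // and byte-fills any trailing remainder.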
+  if (isa<MemSetPatternInst>(Memset)) {
+    return createMemSetPatternLoop(
+        /* InsertBefore */ Memset,
+        /* DstAddr */ Memset->getRawDest(),
+        /* CopyLen */ Memset->getLength(),
+        /* SetValue */ Memset->getValue(),
+        /* Alignment */ Memset->getDestAlign().valueOrOne(),
+        Memset->isVolatile());
+  }
+
   createMemSetLoop(/* InsertBefore */ Memset,
                    /* DstAddr */ Memset->getRawDest(),
                    /* CopyLen */ Memset->getLength(),
diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..ea50ae0b56e40
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -0,0 +1,591 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST
+
+define void @memset_1(ptr %a, i128 %value) nounwind {
+; RV32-BOTH-LABEL: memset_1:
+; RV32-BOTH:       # %bb.0:
+; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sb a1, 0(a0)
+; RV32-BOTH-NEXT:    ret
+;
+; RV64-BOTH-LABEL: memset_1:
+; RV64-BOTH:       # %bb.0:
+; RV64-BOTH-NEXT:    sb a1, 0(a0)
+; RV64-BOTH-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_2(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_2:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_2:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sh a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 2, i1 0)
+  ret void
+}
+
+define void @memset_3(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a1, 8
+; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sb a1, 2(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_3:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a1, 0(a0)
+; RV32-FAST-NEXT:    srli a1, a1, 16
+; RV32-FAST-NEXT:    sb a1, 2(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_3:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sh a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 16
+; RV64-FAST-NEXT:    sb a1, 2(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0)
+  ret void
+}
+
+define void @memset_4(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_4:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_4:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 4, i1 0)
+  ret void
+}
+
+define void @memset_5(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_5:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sb a2, 0(a0)
+; RV32-NEXT:    sb a1, 4(a0)
+; RV32-NEXT:    srli a1, a2, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    srli a1, a2, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_5:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    srli a1, a1, 32
+; RV64-NEXT:    sb a1, 4(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_5:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sb a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_5:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sb a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 5, i1 0)
+  ret void
+}
+
+define void @memset_6(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_6:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_6:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_6:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_6:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sh a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 6, i1 0)
+  ret void
+}
+
+define void @memset_7(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a3, a2, 8
+; RV32-NEXT:    sb a3, 5(a0)
+; RV32-NEXT:    srli a2, a2, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_7:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    srli a2, a2, 16
+; RV32-FAST-NEXT:    sb a2, 6(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_7:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a2, a1, 48
+; RV64-FAST-NEXT:    sb a2, 6(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sh a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 7, i1 0)
+  ret void
+}
+
+define void @memset_8(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a3, a2, 24
+; RV32-NEXT:    sb a3, 7(a0)
+; RV32-NEXT:    srli a3, a2, 16
+; RV32-NEXT:    sb a3, 6(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_8:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_8:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @memset_9(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_9:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a3, 0(a1)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a3, 0(a0)
+; RV32-NEXT:    sb a1, 8(a0)
+; RV32-NEXT:    srli a1, a2, 24
+; RV32-NEXT:    sb a1, 7(a0)
+; RV32-NEXT:    srli a1, a2, 16
+; RV32-NEXT:    sb a1, 6(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a1, a3, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    srli a1, a3, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_9:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_9:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a3, 0(a1)
+; RV32-FAST-NEXT:    lw a1, 8(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 0(a0)
+; RV32-FAST-NEXT:    sb a1, 8(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_9:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sb a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 9, i1 0)
+  ret void
+}
+
+define void @memset_16(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 12(a0)
+; RV32-NEXT:    sb a3, 8(a0)
+; RV32-NEXT:    sb a4, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a5, a2, 24
+; RV32-NEXT:    sb a5, 15(a0)
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    sb a5, 14(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 13(a0)
+; RV32-NEXT:    srli a2, a3, 24
+; RV32-NEXT:    sb a2, 11(a0)
+; RV32-NEXT:    srli a2, a3, 16
+; RV32-NEXT:    sb a2, 10(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 9(a0)
+; RV32-NEXT:    srli a2, a4, 24
+; RV32-NEXT:    sb a2, 7(a0)
+; RV32-NEXT:    srli a2, a4, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a4, a4, 8
+; RV32-NEXT:    sb a4, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a3, a2, 56
+; RV64-NEXT:    sb a3, 15(a0)
+; RV64-NEXT:    srli a3, a2, 48
+; RV64-NEXT:    sb a3, 14(a0)
+; RV64-NEXT:    srli a3, a2, 40
+; RV64-NEXT:    sb a3, 13(a0)
+; RV64-NEXT:    srli a3, a2, 32
+; RV64-NEXT:    sb a3, 12(a0)
+; RV64-NEXT:    srli a3, a2, 24
+; RV64-NEXT:    sb a3, 11(a0)
+; RV64-NEXT:    srli a3, a2, 16
+; RV64-NEXT:    sb a3, 10(a0)
+; RV64-NEXT:    srli a2, a2, 8
+; RV64-NEXT:    sb a2, 9(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_16:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 12(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_16:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_17(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_17:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 12(a0)
+; RV32-NEXT:    sb a3, 8(a0)
+; RV32-NEXT:    sb a4, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    sb a1, 16(a0)
+; RV32-NEXT:    srli a5, a2, 24
+; RV32-NEXT:    sb a5, 15(a0)
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    sb a5, 14(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 13(a0)
+; RV32-NEXT:    srli a2, a3, 24
+; RV32-NEXT:    sb a2, 11(a0)
+; RV32-NEXT:    srli a2, a3, 16
+; RV32-NEXT:    sb a2, 10(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 9(a0)
+; RV32-NEXT:    srli a2, a4, 24
+; RV32-NEXT:    sb a2, 7(a0)
+; RV32-NEXT:    srli a2, a4, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a4, a4, 8
+; RV32-NEXT:    sb a4, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_17:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    sb a1, 16(a0)
+; RV64-NEXT:    srli a3, a2, 56
+; RV64-NEXT:    sb a3, 15(a0)
+; RV64-NEXT:    srli a3, a2, 48
+; RV64-NEXT:    sb a3, 14(a0)
+; RV64-NEXT:    srli a3, a2, 40
+; RV64-NEXT:    sb a3, 13(a0)
+; RV64-NEXT:    srli a3, a2, 32
+; RV64-NEXT:    sb a3, 12(a0)
+; RV64-NEXT:    srli a3, a2, 24
+; RV64-NEXT:    sb a3, 11(a0)
+; RV64-NEXT:    srli a3, a2, 16
+; RV64-NEXT:    sb a3, 10(a0)
+; RV64-NEXT:    srli a2, a2, 8
+; RV64-NEXT:    sb a2, 9(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_17:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 12(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sb a1, 16(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_17:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    sb a1, 16(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 17, i1 0)
+  ret void
+}
+
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
new file mode 100644
index 0000000000000..bb982488eb15e
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "PowerPC" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
new file mode 100644
index 0000000000000..8434ca1c9016b
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
@@ -0,0 +1,8 @@
+; RUN: not opt -mtriple=powerpc64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: memset_pattern expansion not currently implemented for big-endian targets
+
+define void @memset_pattern_x(ptr %a, i128 %value, i64 %x) nounwind {
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 %x, i1 0)
+  ret void
+}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..17351748513d9
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "RISCV" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll
new file mode 100644
index 0000000000000..ce4ae0cf14c9e
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll
@@ -0,0 +1,8 @@
+; RUN: not opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Pattern width for memset_pattern must be a power of 2
+
+define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
+  tail call void @llvm.memset_pattern.p0.i64.i127(ptr %a, i127 %value, i64 %x, i1 0)
+  ret void
+}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..74ca4c4422a75
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 1
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_3(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_3(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 3
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_14(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_14(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 14
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 14, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_16(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 1, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 true, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_38(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_38(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 2, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 6
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 38, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[X]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[X]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 0, [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i64 [ [[TMP1]], [[TMP0]] ], [ [[TMP7:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16
+; CHECK-NEXT:    [[TMP7]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP6]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP15:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP16:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i128 [[TMP12]] to i8
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-NEXT:    store i8 [[TMP13]], ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15]] = add i64 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP16]] = lshr i128 [[TMP12]], 8
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ult i64 [[TMP15]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 %x, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i256_x(
+; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[X]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[X]], 31
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 0, [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i64 [ [[TMP1]], [[TMP0]] ], [ [[TMP7:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i256 [[VALUE]], ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6]] = getelementptr inbounds i8, ptr [[TMP4]], i64 32
+; CHECK-NEXT:    [[TMP7]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP6]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP15:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i256 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP16:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i256 [[TMP12]] to i8
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-NEXT:    store i8 [[TMP13]], ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15]] = add i64 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP16]] = lshr i256 [[TMP12]], 8
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ult i64 [[TMP15]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i256(ptr %a, i256 %value, i64 %x, i1 0)
+  ret void
+}
diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index b1b9f7ee4be11..37745c6a9acee 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -72,6 +72,16 @@ define void @memset_inline_is_volatile(ptr %dest, i8 %value, i1 %is.volatile) {
 }
 
 
+declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1)
+define void @memset_pattern_is_volatile(ptr %dest, i32 %value, i1 %is.volatile) {
+  ; CHECK: immarg operand has non-immediate parameter
+  ; CHECK-NEXT: i1 %is.volatile
+  ; CHECK-NEXT: call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  ret void
+}
+
+
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 define void @objectsize(ptr %ptr, i1 %a, i1 %b, i1 %c) {
   ; CHECK: immarg operand has non-immediate parameter
diff --git a/llvm/test/Verifier/memset-pattern.ll b/llvm/test/Verifier/memset-pattern.ll
new file mode 100644
index 0000000000000..7f2e01ef99ea5
--- /dev/null
+++ b/llvm/test/Verifier/memset-pattern.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: alignment is not a power of two
+
+define void @foo(ptr %P, i32 %value) {
+  call void @llvm.memset_pattern.p0.i32.i32(ptr align 3 %P, i32 %value, i32 4, i1 false)
+  ret void
+}
+declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) nounwind


