[llvm] [IR] Initial introduction of memset_pattern (PR #97583)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 3 07:33:18 PDT 2024


https://github.com/asb created https://github.com/llvm/llvm-project/pull/97583

Supersedes the draft PR #94992, taking a different approach following feedback:
* Lower in PreISelIntrinsicLowering
* Don't require that the number of bytes to set is a compile-time constant
* Define llvm.memset_pattern rather than llvm.memset_pattern.inline

As discussed in the [RFC thread](https://discourse.llvm.org/t/rfc-introducing-an-llvm-memset-pattern-inline-intrinsic/79496), the intent is that the intrinsic will be lowered to loops, a sequence of stores, or libcalls depending on the expected cost and the availability of libcalls on the target. Right now there is just a single lowering path that aims to handle all cases, though it hasn't yet been generalised beyond i128 (the width equivalent to memset_pattern16). My intent is to clean this up, then follow up with additional PRs adding further optimisations where possible (e.g. when libcalls are available, or when arguments are known to be constant).
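
To illustrate (the IR names below are mine for exposition, not taken from the patch), a call to the i128 variant handled by this patch looks like:

    ; Fill 36 bytes at %dst: two full copies of the 16-byte pattern,
    ; then the four lowest-order bytes of %pattern as the remainder.
    call void @llvm.memset_pattern.p0.i64.i128(ptr %dst, i128 %pattern, i64 36, i1 false)

PreISelIntrinsicLowering expands such calls into a loop of full-pattern stores followed by a byte-wise remainder loop (see createMemSetPatternLoop below).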

>From 6560a5c1828cc7532a028dcf0c199cb9311e00d0 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 15 May 2024 12:47:05 +0100
Subject: [PATCH] [IR] Initial introduction of memset_pattern

---
 llvm/docs/LangRef.rst                         |  56 ++
 llvm/include/llvm/IR/InstVisitor.h            |   3 +
 llvm/include/llvm/IR/IntrinsicInst.h          |  22 +-
 llvm/include/llvm/IR/Intrinsics.td            |   8 +
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp |   8 +
 llvm/lib/IR/Verifier.cpp                      |   3 +-
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 110 ++++
 llvm/test/CodeGen/RISCV/memset-pattern.ll     | 591 ++++++++++++++++++
 .../RISCV/lit.local.cfg                       |   2 +
 .../RISCV/memset-pattern.ll                   | 162 +++++
 llvm/test/Verifier/intrinsic-immarg.ll        |   9 +
 llvm/test/Verifier/memset-pattern.ll          |   9 +
 12 files changed, 981 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/memset-pattern.ll
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
 create mode 100644 llvm/test/Verifier/memset-pattern.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c98332d3a24fc..2641dc1a45348 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15231,6 +15231,62 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
 '``llvm.memset.*``', but the generated code is guaranteed not to call any
 external functions.
 
+.. _int_memset_pattern:
+
+'``llvm.memset_pattern``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset_pattern`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths, however.
+
+::
+
+      declare void @llvm.memset_pattern.p0.i64.i128(ptr <dest>, i128 <val>,
+                                                    i64 <len>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill a block of memory with
+a repeating pattern value. This may be expanded to an inline loop, a sequence
+of stores, or a libcall depending on what is available for the target and the
+expected performance and code-size impact.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the pattern value with which to fill it, the third is an integer
+specifying the number of bytes to fill, and the fourth is a boolean
+indicating whether the access is volatile.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the
+``llvm.memset_pattern`` call is a :ref:`volatile operation <volatile>`. The
+detailed access behavior is not very cleanly specified and it is unwise to
+depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset_pattern.*``' intrinsics fill "len" bytes of memory
+starting at the destination location with a repeated copy of ``<val>``. If
+the destination is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.
+
+If ``<len>`` is not an integer multiple of the pattern width in bytes, then
+any trailing remainder bytes are filled with the lowest-order bytes of
+``<val>``.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` must be well-defined, otherwise the
+behavior is undefined.
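+
+For example, the following call (with placeholder values ``%dst`` and
+``%pattern``) writes 20 bytes: one full copy of the 16-byte pattern followed
+by its four lowest-order bytes::
+
+      call void @llvm.memset_pattern.p0.i64.i128(ptr %dst, i128 %pattern,
+                                                 i64 20, i1 false)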
+
 .. _int_sqrt:
 
 '``llvm.sqrt.*``' Intrinsic
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 311e0ac47ddfa..aa4f0f36e4ed7 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -208,6 +208,7 @@ class InstVisitor {
   RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); }
+  RetTy visitMemSetPatternInst(MemSetPatternInst &I) { DELEGATE(MemSetInst); }
   RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
   RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); }
   RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
@@ -295,6 +296,8 @@ class InstVisitor {
       case Intrinsic::memset:      DELEGATE(MemSetInst);
       case Intrinsic::memset_inline:
         DELEGATE(MemSetInlineInst);
+      case Intrinsic::memset_pattern:
+        DELEGATE(MemSetPatternInst);
       case Intrinsic::vastart:     DELEGATE(VAStartInst);
       case Intrinsic::vaend:       DELEGATE(VAEndInst);
       case Intrinsic::vacopy:      DELEGATE(VACopyInst);
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 3963a5c8ab8f9..72439ebc14a54 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1171,6 +1171,7 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -1182,7 +1183,8 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
   }
 };
 
-/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
+/// This class wraps the llvm.memset, llvm.memset.inline, and
+/// llvm.memset_pattern intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -1190,6 +1192,7 @@ class MemSetInst : public MemSetBase<MemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
       return true;
     default:
       return false;
@@ -1215,6 +1218,21 @@ class MemSetInlineInst : public MemSetInst {
   }
 };
 
+/// This class wraps the llvm.memset_pattern intrinsic.
+class MemSetPatternInst : public MemSetInst {
+public:
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 /// This class wraps the llvm.memcpy/memmove intrinsics.
 class MemTransferInst : public MemTransferBase<MemIntrinsic> {
 public:
@@ -1294,6 +1312,7 @@ class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memcpy_element_unordered_atomic:
     case Intrinsic::memmove_element_unordered_atomic:
     case Intrinsic::memset_element_unordered_atomic:
@@ -1316,6 +1335,7 @@ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::memset_pattern:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c7d383a5d0c0c..09e759e0e25b3 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1003,6 +1003,14 @@ def int_memset_inline
        NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
        ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
+// Memset variant that writes a given pattern.
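+// The arguments are (dest, pattern, len-in-bytes, isvolatile). Unlike
+// llvm.memset.inline, only isvolatile must be an immediate: the length may
+// be a runtime value.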
+def int_memset_pattern
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>], "llvm.memset_pattern">;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
 
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 0777acf633187..59e06dd92be14 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -263,6 +263,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
 
       break;
     }
+    case Intrinsic::memset_pattern: {
+      auto *Memset = cast<MemSetPatternInst>(Inst);
+      expandMemSetAsLoop(Memset);
+      Changed = true;
+      Memset->eraseFromParent();
+      break;
+    }
     default:
       llvm_unreachable("unhandled intrinsic");
     }
@@ -280,6 +287,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_pattern:
       Changed |= expandMemIntrinsicUses(F);
       break;
     case Intrinsic::load_relative:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c98f61d555140..a0bef394ecee3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5446,7 +5446,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-  case Intrinsic::memset_inline: {
+  case Intrinsic::memset_inline:
+  case Intrinsic::memset_pattern: {
     break;
   }
   case Intrinsic::memcpy_element_unordered_atomic:
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d84e9f094e03a..3adcc51a81ed5 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -455,6 +455,106 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   ElseTerm->eraseFromParent();
 }
 
+static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
+                                    Value *CopyLen, Value *SetValue,
+                                    Align DstAlign, bool IsVolatile) {
+
+  // To start with, let's assume SetValue is an i128 and bail out if it's not.
+  if (SetValue->getType()->getScalarSizeInBits() != 128) {
+    report_fatal_error("Only 128-bit variant supported for now");
+  }
+  unsigned PatternSize = SetValue->getType()->getScalarSizeInBits() / 8;
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getDataLayout();
+
+  BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
+  BasicBlock *LoopBB =
+      BasicBlock::Create(F->getContext(), "storeloop", F, NewBB);
+  BasicBlock *RemCheckBB =
+      BasicBlock::Create(F->getContext(), "remcheck", F, NewBB);
+  BasicBlock *RemainderLoopBB =
+      BasicBlock::Create(F->getContext(), "remainderloop", F, NewBB);
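+  // The generated control flow is: OrigBB branches either to the pattern
+  // store loop or, if no full pattern fits, straight to the remainder check.
+  // The remainder check then branches to a byte-wise remainder loop or
+  // directly to the continuation block (NewBB).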
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  ConstantInt *CILoopOpSize =
+      ConstantInt::get(cast<IntegerType>(TypeOfCopyLen), PatternSize);
+  Value *RuntimeLoopCount =
+      getRuntimeLoopCount(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
+  Value *RuntimeRemainder =
+      getRuntimeLoopRemainder(DL, Builder, CopyLen, CILoopOpSize, PatternSize);
+
+  Builder.CreateCondBr(Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0),
+                                            RuntimeLoopCount),
+                       RemCheckBB, LoopBB);
+  OrigBB->getTerminator()->eraseFromParent();
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *CurrentDst = LoopBuilder.CreatePHI(DstAddr->getType(), 0);
+  CurrentDst->addIncoming(DstAddr, OrigBB);
+  PHINode *LoopCount = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopCount->addIncoming(RuntimeLoopCount, OrigBB);
+
+  // Store one full copy of the pattern per iteration. Stores after the first
+  // are only guaranteed the common alignment of DstAlign and the pattern
+  // width, so use that for all of them.
+  Align PartDstAlign = commonAlignment(DstAlign, PatternSize);
+  LoopBuilder.CreateAlignedStore(SetValue, CurrentDst, PartDstAlign,
+                                 IsVolatile);
+
+  // Advance the destination by the pattern width in bytes.
+  Value *NextDst = LoopBuilder.CreateInBoundsGEP(
+      LoopBuilder.getInt8Ty(), CurrentDst,
+      ConstantInt::get(TypeOfCopyLen, PatternSize));
+  CurrentDst->addIncoming(NextDst, LoopBB);
+
+  Value *NewLoopCount =
+      LoopBuilder.CreateSub(LoopCount, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopCount->addIncoming(NewLoopCount, LoopBB);
+
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpNE(NewLoopCount,
+                               ConstantInt::get(TypeOfCopyLen, 0)),
+      LoopBB, RemCheckBB);
+
+  IRBuilder<> RemCheckBuilder(RemCheckBB, RemCheckBB->begin());
+  // Branch to the end if there are no remainder bytes.
+  PHINode *RemainderDstPHI = RemCheckBuilder.CreatePHI(NextDst->getType(), 0);
+  RemainderDstPHI->addIncoming(DstAddr, OrigBB);
+  RemainderDstPHI->addIncoming(NextDst, LoopBB);
+  RemCheckBuilder.CreateCondBr(
+      RemCheckBuilder.CreateICmpEQ(RuntimeRemainder,
+                                   ConstantInt::get(TypeOfCopyLen, 0)),
+      NewBB, RemainderLoopBB);
+
+  // Remainder loop
+  IRBuilder<> RemainderLoopBuilder(RemainderLoopBB);
+  PHINode *ByteIndex = RemainderLoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  ByteIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), RemCheckBB);
+  Type *TypeOfSetValue = SetValue->getType();
+  PHINode *ShiftedValue = RemainderLoopBuilder.CreatePHI(TypeOfSetValue, 0);
+  ShiftedValue->addIncoming(SetValue, RemCheckBB);
+
+  // Get the byte to store
+  Value *ByteToStore = RemainderLoopBuilder.CreateTrunc(
+      ShiftedValue, RemainderLoopBuilder.getInt8Ty());
+
+  // Store the byte
+  RemainderLoopBuilder.CreateStore(
+      ByteToStore,
+      RemainderLoopBuilder.CreateInBoundsGEP(RemainderLoopBuilder.getInt8Ty(),
+                                             RemainderDstPHI, ByteIndex),
+      IsVolatile);
+
+  Value *NewByteIndex = RemainderLoopBuilder.CreateAdd(
+      ByteIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  ByteIndex->addIncoming(NewByteIndex, RemainderLoopBB);
+  Value *NewShiftedValue = RemainderLoopBuilder.CreateLShr(
+      ShiftedValue, ConstantInt::get(TypeOfSetValue, 8));
+  ShiftedValue->addIncoming(NewShiftedValue, RemainderLoopBB);
+
+  RemainderLoopBuilder.CreateCondBr(
+      RemainderLoopBuilder.CreateICmpULT(NewByteIndex, RuntimeRemainder),
+      RemainderLoopBB, NewBB);
+}
+
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                              Value *CopyLen, Value *SetValue, Align DstAlign,
                              bool IsVolatile) {
@@ -590,6 +690,16 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
 }
 
 void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
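+  // llvm.memset_pattern gets a dedicated expansion: the stored value is
+  // wider than a byte, so trailing bytes may need a separate remainder loop.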
+  if (isa<MemSetPatternInst>(Memset)) {
+    return createMemSetPatternLoop(
+        /* InsertBefore */ Memset,
+        /* DstAddr */ Memset->getRawDest(),
+        /* CopyLen */ Memset->getLength(),
+        /* SetValue */ Memset->getValue(),
+        /* Alignment */ Memset->getDestAlign().valueOrOne(),
+        Memset->isVolatile());
+  }
+
   createMemSetLoop(/* InsertBefore */ Memset,
                    /* DstAddr */ Memset->getRawDest(),
                    /* CopyLen */ Memset->getLength(),
diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..ea50ae0b56e40
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -0,0 +1,591 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST
+
+define void @memset_1(ptr %a, i128 %value) nounwind {
+; RV32-BOTH-LABEL: memset_1:
+; RV32-BOTH:       # %bb.0:
+; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sb a1, 0(a0)
+; RV32-BOTH-NEXT:    ret
+;
+; RV64-BOTH-LABEL: memset_1:
+; RV64-BOTH:       # %bb.0:
+; RV64-BOTH-NEXT:    sb a1, 0(a0)
+; RV64-BOTH-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_2(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_2:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_2:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sh a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 2, i1 0)
+  ret void
+}
+
+define void @memset_3(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a1, 8
+; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sb a1, 2(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_3:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a1, 0(a0)
+; RV32-FAST-NEXT:    srli a1, a1, 16
+; RV32-FAST-NEXT:    sb a1, 2(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_3:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sh a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 16
+; RV64-FAST-NEXT:    sb a1, 2(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0)
+  ret void
+}
+
+define void @memset_4(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_4:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_4:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 4, i1 0)
+  ret void
+}
+
+define void @memset_5(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_5:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sb a2, 0(a0)
+; RV32-NEXT:    sb a1, 4(a0)
+; RV32-NEXT:    srli a1, a2, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    srli a1, a2, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_5:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    srli a1, a1, 32
+; RV64-NEXT:    sb a1, 4(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_5:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sb a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_5:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sb a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 5, i1 0)
+  ret void
+}
+
+define void @memset_6(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_6:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_6:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_6:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_6:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sh a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 6, i1 0)
+  ret void
+}
+
+define void @memset_7(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a3, a2, 8
+; RV32-NEXT:    sb a3, 5(a0)
+; RV32-NEXT:    srli a2, a2, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_7:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sh a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    srli a2, a2, 16
+; RV32-FAST-NEXT:    sb a2, 6(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_7:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    srli a2, a1, 48
+; RV64-FAST-NEXT:    sb a2, 6(a0)
+; RV64-FAST-NEXT:    srli a1, a1, 32
+; RV64-FAST-NEXT:    sh a1, 4(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 7, i1 0)
+  ret void
+}
+
+define void @memset_8(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a3, a2, 24
+; RV32-NEXT:    sb a3, 7(a0)
+; RV32-NEXT:    srli a3, a2, 16
+; RV32-NEXT:    sb a3, 6(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_8:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_8:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @memset_9(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_9:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a3, 0(a1)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    sb a2, 4(a0)
+; RV32-NEXT:    sb a3, 0(a0)
+; RV32-NEXT:    sb a1, 8(a0)
+; RV32-NEXT:    srli a1, a2, 24
+; RV32-NEXT:    sb a1, 7(a0)
+; RV32-NEXT:    srli a1, a2, 16
+; RV32-NEXT:    sb a1, 6(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 5(a0)
+; RV32-NEXT:    srli a1, a3, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    srli a1, a3, 16
+; RV32-NEXT:    sb a1, 2(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_9:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_9:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 4(a1)
+; RV32-FAST-NEXT:    lw a3, 0(a1)
+; RV32-FAST-NEXT:    lw a1, 8(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 0(a0)
+; RV32-FAST-NEXT:    sb a1, 8(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_9:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sb a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 9, i1 0)
+  ret void
+}
+
+define void @memset_16(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 12(a0)
+; RV32-NEXT:    sb a3, 8(a0)
+; RV32-NEXT:    sb a4, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    srli a5, a2, 24
+; RV32-NEXT:    sb a5, 15(a0)
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    sb a5, 14(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 13(a0)
+; RV32-NEXT:    srli a2, a3, 24
+; RV32-NEXT:    sb a2, 11(a0)
+; RV32-NEXT:    srli a2, a3, 16
+; RV32-NEXT:    sb a2, 10(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 9(a0)
+; RV32-NEXT:    srli a2, a4, 24
+; RV32-NEXT:    sb a2, 7(a0)
+; RV32-NEXT:    srli a2, a4, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a4, a4, 8
+; RV32-NEXT:    sb a4, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    srli a3, a2, 56
+; RV64-NEXT:    sb a3, 15(a0)
+; RV64-NEXT:    srli a3, a2, 48
+; RV64-NEXT:    sb a3, 14(a0)
+; RV64-NEXT:    srli a3, a2, 40
+; RV64-NEXT:    sb a3, 13(a0)
+; RV64-NEXT:    srli a3, a2, 32
+; RV64-NEXT:    sb a3, 12(a0)
+; RV64-NEXT:    srli a3, a2, 24
+; RV64-NEXT:    sb a3, 11(a0)
+; RV64-NEXT:    srli a3, a2, 16
+; RV64-NEXT:    sb a3, 10(a0)
+; RV64-NEXT:    srli a2, a2, 8
+; RV64-NEXT:    sb a2, 9(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_16:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 12(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_16:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_17(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_17:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sb a2, 12(a0)
+; RV32-NEXT:    sb a3, 8(a0)
+; RV32-NEXT:    sb a4, 4(a0)
+; RV32-NEXT:    sb a1, 0(a0)
+; RV32-NEXT:    sb a1, 16(a0)
+; RV32-NEXT:    srli a5, a2, 24
+; RV32-NEXT:    sb a5, 15(a0)
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    sb a5, 14(a0)
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sb a2, 13(a0)
+; RV32-NEXT:    srli a2, a3, 24
+; RV32-NEXT:    sb a2, 11(a0)
+; RV32-NEXT:    srli a2, a3, 16
+; RV32-NEXT:    sb a2, 10(a0)
+; RV32-NEXT:    srli a3, a3, 8
+; RV32-NEXT:    sb a3, 9(a0)
+; RV32-NEXT:    srli a2, a4, 24
+; RV32-NEXT:    sb a2, 7(a0)
+; RV32-NEXT:    srli a2, a4, 16
+; RV32-NEXT:    sb a2, 6(a0)
+; RV32-NEXT:    srli a4, a4, 8
+; RV32-NEXT:    sb a4, 5(a0)
+; RV32-NEXT:    srli a2, a1, 24
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sb a1, 1(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_17:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    sb a1, 16(a0)
+; RV64-NEXT:    srli a3, a2, 56
+; RV64-NEXT:    sb a3, 15(a0)
+; RV64-NEXT:    srli a3, a2, 48
+; RV64-NEXT:    sb a3, 14(a0)
+; RV64-NEXT:    srli a3, a2, 40
+; RV64-NEXT:    sb a3, 13(a0)
+; RV64-NEXT:    srli a3, a2, 32
+; RV64-NEXT:    sb a3, 12(a0)
+; RV64-NEXT:    srli a3, a2, 24
+; RV64-NEXT:    sb a3, 11(a0)
+; RV64-NEXT:    srli a3, a2, 16
+; RV64-NEXT:    sb a3, 10(a0)
+; RV64-NEXT:    srli a2, a2, 8
+; RV64-NEXT:    sb a2, 9(a0)
+; RV64-NEXT:    srli a2, a1, 56
+; RV64-NEXT:    sb a2, 7(a0)
+; RV64-NEXT:    srli a2, a1, 48
+; RV64-NEXT:    sb a2, 6(a0)
+; RV64-NEXT:    srli a2, a1, 40
+; RV64-NEXT:    sb a2, 5(a0)
+; RV64-NEXT:    srli a2, a1, 32
+; RV64-NEXT:    sb a2, 4(a0)
+; RV64-NEXT:    srli a2, a1, 24
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    srli a2, a1, 16
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 8
+; RV64-NEXT:    sb a1, 1(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_17:
+; RV32-FAST:       # %bb.0:
+; RV32-FAST-NEXT:    lw a2, 12(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sb a1, 16(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_17:
+; RV64-FAST:       # %bb.0:
+; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    sb a1, 16(a0)
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 17, i1 0)
+  ret void
+}
+
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..17351748513d9
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if "RISCV" not in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..7a65bd30a1f10
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+define void @memset_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 1
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_3(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_3(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 3
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0)
+  ret void
+}
+
+define void @memset_14(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_14(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 14
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 14, i1 0)
+  ret void
+}
+
+define void @memset_16(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_16(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 1, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 true, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_38(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_38(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]]
+; CHECK:       [[STORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 2, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP4]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]]
+; CHECK:       [[REMCHECK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ]
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]]
+; CHECK:       [[REMAINDERLOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP12]] = lshr i128 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 6
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 38, i1 0)
+  ret void
+}
diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index 47189c0b7d052..7b607a54bab70 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -79,6 +79,15 @@ define void @memset_inline_variable_size(ptr %dest, i8 %value, i32 %size) {
   ret void
 }
 
+declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1)
+define void @memset_pattern_is_volatile(ptr %dest, i32 %value, i1 %is.volatile) {
+  ; CHECK: immarg operand has non-immediate parameter
+  ; CHECK-NEXT: i1 %is.volatile
+  ; CHECK-NEXT: call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  ret void
+}
+
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 define void @objectsize(ptr %ptr, i1 %a, i1 %b, i1 %c) {
diff --git a/llvm/test/Verifier/memset-pattern.ll b/llvm/test/Verifier/memset-pattern.ll
new file mode 100644
index 0000000000000..7f2e01ef99ea5
--- /dev/null
+++ b/llvm/test/Verifier/memset-pattern.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: alignment is not a power of two
+
+define void @foo(ptr %P, i32 %value) {
+  call void @llvm.memset_pattern.p0.i32.i32(ptr align 3 %P, i32 %value, i32 4, i1 false)
+  ret void
+}
+declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) nounwind


