[llvm] 298127d - Reapply [IR] Initial introduction of llvm.experimental.memset_pattern (#97583)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 15 07:22:15 PST 2024


Author: Alex Bradbury
Date: 2024-11-15T15:21:39Z
New Revision: 298127dcbe2ecd1f3c49c2109ac96654778f20be

URL: https://github.com/llvm/llvm-project/commit/298127dcbe2ecd1f3c49c2109ac96654778f20be
DIFF: https://github.com/llvm/llvm-project/commit/298127dcbe2ecd1f3c49c2109ac96654778f20be.diff

LOG: Reapply [IR] Initial introduction of llvm.experimental.memset_pattern (#97583)

Relands 7ff3a9acd84654c9ec2939f45ba27f162ae7fbc3 after regenerating the
test case.

Supersedes the draft PR #94992, taking a different approach following
feedback:
* Lower in PreISelIntrinsicLowering
* Don't require that the number of bytes to set is a compile-time
constant
* Define llvm.memset_pattern rather than llvm.memset_pattern.inline

As discussed in the [RFC
thread](https://discourse.llvm.org/t/rfc-introducing-an-llvm-memset-pattern-inline-intrinsic/79496),
the intent is that the intrinsic will be lowered to loops, a sequence of
stores, or libcalls depending on the expected cost and availability of
libcalls on the target. Right now, there's just a single lowering path
that aims to handle all cases. My intent would be to follow up with
additional PRs that add additional optimisations when possible (e.g.
when libcalls are available, when arguments are known to be constant
etc).

Added: 
    llvm/test/CodeGen/RISCV/memset-pattern.ll
    llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
    llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
    llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
    llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
    llvm/test/Verifier/memset-pattern.ll

Modified: 
    llvm/docs/LangRef.rst
    llvm/include/llvm/IR/InstVisitor.h
    llvm/include/llvm/IR/IntrinsicInst.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
    llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
    llvm/test/Verifier/intrinsic-immarg.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index f67692f4bf17cf..c81abb38c6981f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15430,6 +15430,63 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
 '``llvm.memset.*``', but the generated code is guaranteed not to call any
 external functions.
 
+.. _int_experimental_memset_pattern:
+
+'``llvm.experimental.memset.pattern``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use
+``llvm.experimental.memset.pattern`` on any integer bit width and for
+different address spaces. Not all targets support all bit widths however.
+
+::
+
+      declare void @llvm.experimental.memset.pattern.p0.i128.i64(ptr <dest>, i128 <val>,
+                                                                 i64 <count>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.memset.pattern.*``' intrinsics fill a block of memory
+with a particular value. This may be expanded to an inline loop, a sequence of
+stores, or a libcall depending on what is available for the target and the
+expected performance and code size impact.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the value with which to fill it, the third argument is an integer
+argument specifying the number of times to fill the value, and the fourth is a
+boolean indicating a volatile access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the
+``llvm.experimental.memset.pattern`` call is a :ref:`volatile operation
+<volatile>`. The detailed access behavior is not very cleanly specified and it
+is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.memset.pattern.*``' intrinsic fills memory starting at
+the destination location with the given pattern ``<count>`` times,
+incrementing by the allocation size of the type each time. The stores follow
+the usual semantics of store instructions, including regarding endianness and
+padding. If the argument is known to be aligned to some boundary, this can be
+specified as an attribute on the argument.
+
+If ``<count>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<count>`` is not a well-defined value, the behavior is undefined.
+If ``<count>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
 .. _int_sqrt:
 
 '``llvm.sqrt.*``' Intrinsic

diff  --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 311e0ac47ddfad..5fc6fbfd0f28e4 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -208,6 +208,9 @@ class InstVisitor {
   RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); }
+  RetTy visitMemSetPatternInst(MemSetPatternInst &I) {
+    DELEGATE(IntrinsicInst);
+  }
   RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
   RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); }
   RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
@@ -295,6 +298,8 @@ class InstVisitor {
       case Intrinsic::memset:      DELEGATE(MemSetInst);
       case Intrinsic::memset_inline:
         DELEGATE(MemSetInlineInst);
+      case Intrinsic::experimental_memset_pattern:
+        DELEGATE(MemSetPatternInst);
       case Intrinsic::vastart:     DELEGATE(VAStartInst);
       case Intrinsic::vaend:       DELEGATE(VAEndInst);
       case Intrinsic::vacopy:      DELEGATE(VACopyInst);

diff  --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 920eed01374c83..3436216d478e38 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1263,6 +1263,41 @@ class MemSetInlineInst : public MemSetInst {
   }
 };
 
+/// This is the base class for llvm.experimental.memset.pattern
+class MemSetPatternIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
+private:
+  enum { ARG_VOLATILE = 3 };
+
+public:
+  ConstantInt *getVolatileCst() const {
+    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(ARG_VOLATILE)));
+  }
+
+  bool isVolatile() const { return !getVolatileCst()->isZero(); }
+
+  void setVolatile(Constant *V) { setArgOperand(ARG_VOLATILE, V); }
+
+  // Methods for support of type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::experimental_memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This class wraps the llvm.experimental.memset.pattern intrinsic.
+class MemSetPatternInst : public MemSetBase<MemSetPatternIntrinsic> {
+public:
+  // Methods for support of type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::experimental_memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 /// This class wraps the llvm.memcpy/memmove intrinsics.
 class MemTransferInst : public MemTransferBase<MemIntrinsic> {
 public:

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 88a4201dcfe375..1ca8c2565ab0b6 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1006,6 +1006,17 @@ def int_memset_inline
        NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
        ImmArg<ArgIndex<3>>]>;
 
+// Memset variant that writes a given pattern.
+def int_experimental_memset_pattern
+    : Intrinsic<[],
+      [llvm_anyptr_ty, // Destination.
+       llvm_anyint_ty, // Pattern value.
+       llvm_anyint_ty, // Count (number of times to fill value).
+       llvm_i1_ty],    // IsVolatile.
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>]>;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
 

diff  --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 314435324b473b..1007d282b2ac5c 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -25,6 +25,7 @@ class Instruction;
 class MemCpyInst;
 class MemMoveInst;
 class MemSetInst;
+class MemSetPatternInst;
 class ScalarEvolution;
 class TargetTransformInfo;
 class Value;
@@ -57,6 +58,9 @@ bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI);
 /// Expand \p MemSet as a loop. \p MemSet is not deleted.
 void expandMemSetAsLoop(MemSetInst *MemSet);
 
+/// Expand memset.pattern \p MemSet as a loop. \p MemSet is not deleted.
+void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
+
 /// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted.
 void expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemCpy,
                               const TargetTransformInfo &TTI,

diff  --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 3373b76edb268f..4a3d1673c2a7c1 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -320,6 +320,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       Memset->eraseFromParent();
       break;
     }
+    case Intrinsic::experimental_memset_pattern: {
+      auto *Memset = cast<MemSetPatternInst>(Inst);
+      expandMemSetPatternAsLoop(Memset);
+      Changed = true;
+      Memset->eraseFromParent();
+      break;
+    }
     default:
       llvm_unreachable("unhandled intrinsic");
     }
@@ -339,6 +346,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::experimental_memset_pattern:
       Changed |= expandMemIntrinsicUses(F);
       break;
     case Intrinsic::load_relative:

diff  --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 24e7b48c834e7c..6ee0a5a16e2202 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5519,7 +5519,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-  case Intrinsic::memset_inline: {
+  case Intrinsic::memset_inline:
+  case Intrinsic::experimental_memset_pattern: {
     break;
   }
   case Intrinsic::memcpy_element_unordered_atomic:

diff  --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 546217093550a2..91291b429ea434 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -970,6 +970,15 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
                    Memset->isVolatile());
 }
 
+void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
+  createMemSetLoop(/* InsertBefore=*/Memset,
+                   /* DstAddr=*/Memset->getRawDest(),
+                   /* CopyLen=*/Memset->getLength(),
+                   /* SetValue=*/Memset->getValue(),
+                   /* Alignment=*/Memset->getDestAlign().valueOrOne(),
+                   Memset->isVolatile());
+}
+
 void llvm::expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemcpy,
                                     const TargetTransformInfo &TTI,
                                     ScalarEvolution *SE) {

diff  --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
new file mode 100644
index 00000000000000..35ce7fad0ea677
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \
+; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST
+
+; TODO: Due to the initial naive lowering implementation of memset.pattern in
+; PreISelIntrinsicLowering, the generated code is not good.
+
+define void @memset_1(ptr %a, i128 %value) nounwind {
+; RV32-BOTH-LABEL: memset_1:
+; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV32-BOTH-NEXT:    li a2, 0
+; RV32-BOTH-NEXT:    lw a3, 0(a1)
+; RV32-BOTH-NEXT:    lw a4, 4(a1)
+; RV32-BOTH-NEXT:    lw a5, 8(a1)
+; RV32-BOTH-NEXT:    lw a1, 12(a1)
+; RV32-BOTH-NEXT:    li a6, 0
+; RV32-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
+; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-BOTH-NEXT:    slli a7, a2, 4
+; RV32-BOTH-NEXT:    addi a2, a2, 1
+; RV32-BOTH-NEXT:    add a7, a0, a7
+; RV32-BOTH-NEXT:    seqz t0, a2
+; RV32-BOTH-NEXT:    add a6, a6, t0
+; RV32-BOTH-NEXT:    or t0, a2, a6
+; RV32-BOTH-NEXT:    sw a3, 0(a7)
+; RV32-BOTH-NEXT:    sw a4, 4(a7)
+; RV32-BOTH-NEXT:    sw a5, 8(a7)
+; RV32-BOTH-NEXT:    sw a1, 12(a7)
+; RV32-BOTH-NEXT:    beqz t0, .LBB0_1
+; RV32-BOTH-NEXT:  # %bb.2: # %split
+; RV32-BOTH-NEXT:    ret
+;
+; RV64-BOTH-LABEL: memset_1:
+; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV64-BOTH-NEXT:    addi a3, a0, 16
+; RV64-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
+; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-BOTH-NEXT:    sd a1, 0(a0)
+; RV64-BOTH-NEXT:    sd a2, 8(a0)
+; RV64-BOTH-NEXT:    addi a0, a0, 16
+; RV64-BOTH-NEXT:    bne a0, a3, .LBB0_1
+; RV64-BOTH-NEXT:  # %bb.2: # %split
+; RV64-BOTH-NEXT:    ret
+  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
+; RV32-LABEL: memset_1_noalign:
+; RV32:       # %bb.0: # %loadstoreloop.preheader
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a2, 0
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    srli a7, a4, 24
+; RV32-NEXT:    srli t0, a4, 16
+; RV32-NEXT:    srli t1, a4, 8
+; RV32-NEXT:    srli t2, a5, 24
+; RV32-NEXT:    srli t3, a5, 16
+; RV32-NEXT:    srli t4, a5, 8
+; RV32-NEXT:    srli t5, a6, 24
+; RV32-NEXT:    srli t6, a6, 16
+; RV32-NEXT:    srli s0, a6, 8
+; RV32-NEXT:    srli s1, a1, 24
+; RV32-NEXT:    srli s2, a1, 16
+; RV32-NEXT:    srli s3, a1, 8
+; RV32-NEXT:  .LBB1_1: # %loadstoreloop
+; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NEXT:    slli s4, a2, 4
+; RV32-NEXT:    addi a2, a2, 1
+; RV32-NEXT:    add s4, a0, s4
+; RV32-NEXT:    seqz s5, a2
+; RV32-NEXT:    sb a4, 4(s4)
+; RV32-NEXT:    sb t1, 5(s4)
+; RV32-NEXT:    sb t0, 6(s4)
+; RV32-NEXT:    sb a7, 7(s4)
+; RV32-NEXT:    sb a5, 0(s4)
+; RV32-NEXT:    sb t4, 1(s4)
+; RV32-NEXT:    sb t3, 2(s4)
+; RV32-NEXT:    sb t2, 3(s4)
+; RV32-NEXT:    sb a6, 8(s4)
+; RV32-NEXT:    sb s0, 9(s4)
+; RV32-NEXT:    sb t6, 10(s4)
+; RV32-NEXT:    sb t5, 11(s4)
+; RV32-NEXT:    add a3, a3, s5
+; RV32-NEXT:    or s5, a2, a3
+; RV32-NEXT:    sb a1, 12(s4)
+; RV32-NEXT:    sb s3, 13(s4)
+; RV32-NEXT:    sb s2, 14(s4)
+; RV32-NEXT:    sb s1, 15(s4)
+; RV32-NEXT:    beqz s5, .LBB1_1
+; RV32-NEXT:  # %bb.2: # %split
+; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: memset_1_noalign:
+; RV64:       # %bb.0: # %loadstoreloop.preheader
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi a3, a0, 16
+; RV64-NEXT:    srli a4, a1, 56
+; RV64-NEXT:    srli a5, a1, 48
+; RV64-NEXT:    srli a6, a1, 40
+; RV64-NEXT:    srli a7, a1, 32
+; RV64-NEXT:    srli t0, a1, 24
+; RV64-NEXT:    srli t1, a1, 16
+; RV64-NEXT:    srli t2, a1, 8
+; RV64-NEXT:    srli t3, a2, 56
+; RV64-NEXT:    srli t4, a2, 48
+; RV64-NEXT:    srli t5, a2, 40
+; RV64-NEXT:    srli t6, a2, 32
+; RV64-NEXT:    srli s0, a2, 24
+; RV64-NEXT:    srli s1, a2, 16
+; RV64-NEXT:    srli s2, a2, 8
+; RV64-NEXT:  .LBB1_1: # %loadstoreloop
+; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NEXT:    sb a7, 4(a0)
+; RV64-NEXT:    sb a6, 5(a0)
+; RV64-NEXT:    sb a5, 6(a0)
+; RV64-NEXT:    sb a4, 7(a0)
+; RV64-NEXT:    sb a1, 0(a0)
+; RV64-NEXT:    sb t2, 1(a0)
+; RV64-NEXT:    sb t1, 2(a0)
+; RV64-NEXT:    sb t0, 3(a0)
+; RV64-NEXT:    sb t6, 12(a0)
+; RV64-NEXT:    sb t5, 13(a0)
+; RV64-NEXT:    sb t4, 14(a0)
+; RV64-NEXT:    sb t3, 15(a0)
+; RV64-NEXT:    sb a2, 8(a0)
+; RV64-NEXT:    sb s2, 9(a0)
+; RV64-NEXT:    sb s1, 10(a0)
+; RV64-NEXT:    sb s0, 11(a0)
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    bne a0, a3, .LBB1_1
+; RV64-NEXT:  # %bb.2: # %split
+; RV64-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memset_1_noalign:
+; RV32-FAST:       # %bb.0: # %loadstoreloop.preheader
+; RV32-FAST-NEXT:    li a2, 0
+; RV32-FAST-NEXT:    lw a3, 0(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
+; RV32-FAST-NEXT:    lw a5, 8(a1)
+; RV32-FAST-NEXT:    lw a1, 12(a1)
+; RV32-FAST-NEXT:    li a6, 0
+; RV32-FAST-NEXT:  .LBB1_1: # %loadstoreloop
+; RV32-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-FAST-NEXT:    slli a7, a2, 4
+; RV32-FAST-NEXT:    addi a2, a2, 1
+; RV32-FAST-NEXT:    add a7, a0, a7
+; RV32-FAST-NEXT:    seqz t0, a2
+; RV32-FAST-NEXT:    add a6, a6, t0
+; RV32-FAST-NEXT:    or t0, a2, a6
+; RV32-FAST-NEXT:    sw a3, 0(a7)
+; RV32-FAST-NEXT:    sw a4, 4(a7)
+; RV32-FAST-NEXT:    sw a5, 8(a7)
+; RV32-FAST-NEXT:    sw a1, 12(a7)
+; RV32-FAST-NEXT:    beqz t0, .LBB1_1
+; RV32-FAST-NEXT:  # %bb.2: # %split
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memset_1_noalign:
+; RV64-FAST:       # %bb.0: # %loadstoreloop.preheader
+; RV64-FAST-NEXT:    addi a3, a0, 16
+; RV64-FAST-NEXT:  .LBB1_1: # %loadstoreloop
+; RV64-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    addi a0, a0, 16
+; RV64-FAST-NEXT:    bne a0, a3, .LBB1_1
+; RV64-FAST-NEXT:  # %bb.2: # %split
+; RV64-FAST-NEXT:    ret
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_4(ptr %a, i128 %value) nounwind {
+; RV32-BOTH-LABEL: memset_4:
+; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV32-BOTH-NEXT:    li a2, 0
+; RV32-BOTH-NEXT:    lw a3, 0(a1)
+; RV32-BOTH-NEXT:    lw a4, 4(a1)
+; RV32-BOTH-NEXT:    lw a5, 8(a1)
+; RV32-BOTH-NEXT:    lw a1, 12(a1)
+; RV32-BOTH-NEXT:    li a6, 0
+; RV32-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
+; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-BOTH-NEXT:    slli a7, a2, 4
+; RV32-BOTH-NEXT:    addi a2, a2, 1
+; RV32-BOTH-NEXT:    seqz t0, a2
+; RV32-BOTH-NEXT:    sltiu t1, a2, 4
+; RV32-BOTH-NEXT:    add a6, a6, t0
+; RV32-BOTH-NEXT:    seqz t0, a6
+; RV32-BOTH-NEXT:    and t0, t0, t1
+; RV32-BOTH-NEXT:    add a7, a0, a7
+; RV32-BOTH-NEXT:    sw a3, 0(a7)
+; RV32-BOTH-NEXT:    sw a4, 4(a7)
+; RV32-BOTH-NEXT:    sw a5, 8(a7)
+; RV32-BOTH-NEXT:    sw a1, 12(a7)
+; RV32-BOTH-NEXT:    bnez t0, .LBB2_1
+; RV32-BOTH-NEXT:  # %bb.2: # %split
+; RV32-BOTH-NEXT:    ret
+;
+; RV64-BOTH-LABEL: memset_4:
+; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV64-BOTH-NEXT:    addi a3, a0, 64
+; RV64-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
+; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-BOTH-NEXT:    sd a1, 0(a0)
+; RV64-BOTH-NEXT:    sd a2, 8(a0)
+; RV64-BOTH-NEXT:    addi a0, a0, 16
+; RV64-BOTH-NEXT:    bne a0, a3, .LBB2_1
+; RV64-BOTH-NEXT:  # %bb.2: # %split
+; RV64-BOTH-NEXT:    ret
+  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)
+  ret void
+}
+
+define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
+; RV32-BOTH-LABEL: memset_x:
+; RV32-BOTH:       # %bb.0:
+; RV32-BOTH-NEXT:    or a4, a2, a3
+; RV32-BOTH-NEXT:    beqz a4, .LBB3_5
+; RV32-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
+; RV32-BOTH-NEXT:    li a4, 0
+; RV32-BOTH-NEXT:    lw a5, 0(a1)
+; RV32-BOTH-NEXT:    lw a6, 4(a1)
+; RV32-BOTH-NEXT:    lw a7, 8(a1)
+; RV32-BOTH-NEXT:    lw a1, 12(a1)
+; RV32-BOTH-NEXT:    li t0, 0
+; RV32-BOTH-NEXT:    j .LBB3_3
+; RV32-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
+; RV32-BOTH-NEXT:    # in Loop: Header=BB3_3 Depth=1
+; RV32-BOTH-NEXT:    sltu t1, t0, a3
+; RV32-BOTH-NEXT:    beqz t1, .LBB3_5
+; RV32-BOTH-NEXT:  .LBB3_3: # %loadstoreloop
+; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-BOTH-NEXT:    slli t1, a4, 4
+; RV32-BOTH-NEXT:    addi a4, a4, 1
+; RV32-BOTH-NEXT:    seqz t2, a4
+; RV32-BOTH-NEXT:    add t0, t0, t2
+; RV32-BOTH-NEXT:    add t1, a0, t1
+; RV32-BOTH-NEXT:    sw a5, 0(t1)
+; RV32-BOTH-NEXT:    sw a6, 4(t1)
+; RV32-BOTH-NEXT:    sw a7, 8(t1)
+; RV32-BOTH-NEXT:    sw a1, 12(t1)
+; RV32-BOTH-NEXT:    bne t0, a3, .LBB3_2
+; RV32-BOTH-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
+; RV32-BOTH-NEXT:    sltu t1, a4, a2
+; RV32-BOTH-NEXT:    bnez t1, .LBB3_3
+; RV32-BOTH-NEXT:  .LBB3_5: # %split
+; RV32-BOTH-NEXT:    ret
+;
+; RV64-BOTH-LABEL: memset_x:
+; RV64-BOTH:       # %bb.0:
+; RV64-BOTH-NEXT:    beqz a3, .LBB3_3
+; RV64-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
+; RV64-BOTH-NEXT:    li a4, 0
+; RV64-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
+; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-BOTH-NEXT:    sd a1, 0(a0)
+; RV64-BOTH-NEXT:    sd a2, 8(a0)
+; RV64-BOTH-NEXT:    addi a4, a4, 1
+; RV64-BOTH-NEXT:    addi a0, a0, 16
+; RV64-BOTH-NEXT:    bltu a4, a3, .LBB3_2
+; RV64-BOTH-NEXT:  .LBB3_3: # %split
+; RV64-BOTH-NEXT:    ret
+  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0)
+  ret void
+}

diff  --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000000..bb982488eb15ee
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "PowerPC" in config.root.targets:
+    config.unsupported = True

diff  --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
new file mode 100644
index 00000000000000..1f77c4a6051066
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=powerpc64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s
+
+; Simple smoke test that memset.pattern is still expanded on big endian
+; targets.
+
+define void @memset.pattern(ptr %a, i128 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset.pattern(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 %x, i1 0)
+  ret void
+}

diff  --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
new file mode 100644
index 00000000000000..17351748513d98
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "RISCV" in config.root.targets:
+    config.unsupported = True

diff  --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
new file mode 100644
index 00000000000000..d3ef9fe4cefbd2
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_16(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i127_x(
+; CHECK-SAME: ptr [[A:%.*]], i127 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i127, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    store i127 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i127 %value, i64 %x, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 %x, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i256_x(
+; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i256, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i256 [[VALUE]], ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i256 %value, i64 %x, i1 0)
+  ret void
+}
+
+; The store alignment should be the common alignment of the pattern stride
+; (its allocation size) and the destination pointer.
+define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i15_x_alignment(
+; CHECK-SAME: ptr [[A:%.*]], i15 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    store i15 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]]
+; CHECK:       [[LOADSTORELOOP2]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    store i15 [[VALUE]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i64 [[TMP9]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[LOADSTORELOOP2]], label %[[SPLIT1]]
+; CHECK:       [[SPLIT1]]:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 1 %a, i15 %value, i64 %x, i1 0)
+  call void @llvm.experimental.memset.pattern(ptr align 2 %a, i15 %value, i64 %x, i1 0)
+  ret void
+}

diff  --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index e310cd2e0b7819..ab1286e8a3d760 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -63,6 +63,14 @@ define void @memset_inline_is_volatile(ptr %dest, i8 %value, i1 %is.volatile) {
   ret void
 }
 
+declare void @llvm.experimental.memset.pattern.p0.i32.i32(ptr nocapture, i32, i32, i1)
+define void @memset_pattern_is_volatile(ptr %dest, i32 %value, i1 %is.volatile) {
+  ; CHECK: immarg operand has non-immediate parameter
+  ; CHECK-NEXT: i1 %is.volatile
+  ; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile)
+  ret void
+}
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 define void @objectsize(ptr %ptr, i1 %a, i1 %b, i1 %c) {

diff  --git a/llvm/test/Verifier/memset-pattern.ll b/llvm/test/Verifier/memset-pattern.ll
new file mode 100644
index 00000000000000..7f5301976b7490
--- /dev/null
+++ b/llvm/test/Verifier/memset-pattern.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: alignment is not a power of two
+
+define void @foo(ptr %P, i32 %value) {
+  call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr align 3 %P, i32 %value, i32 4, i1 false)
+  ret void
+}
+declare void @llvm.experimental.memset.pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) nounwind


        


More information about the llvm-commits mailing list