[clang] 38637ee - [clang] Add support for __builtin_memset_inline

Guillaume Chatelet via cfe-commits cfe-commits at lists.llvm.org
Fri Jun 10 06:22:41 PDT 2022


Author: Guillaume Chatelet
Date: 2022-06-10T13:13:59Z
New Revision: 38637ee477541370a90b37f149069d8e5c0c2efd

URL: https://github.com/llvm/llvm-project/commit/38637ee477541370a90b37f149069d8e5c0c2efd
DIFF: https://github.com/llvm/llvm-project/commit/38637ee477541370a90b37f149069d8e5c0c2efd.diff

LOG: [clang] Add support for __builtin_memset_inline

In the same spirit as D73543, and in reply to https://reviews.llvm.org/D126768#3549920, this patch adds support for `__builtin_memset_inline`.

The idea is to get compiler support for easily writing efficient memory function implementations.
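
As an illustration (not part of the patch), a freestanding helper could delegate fixed-size cases to the builtin along these lines; the `inline_memset_small` name and the size cascade are invented for this example:

  static void *inline_memset_small(void *dst, int value, unsigned long size) {
    // Each call site passes a compile-time-constant size, as the builtin
    // requires; dispatching on `size` keeps the constants visible to clang.
    switch (size) {
    case 1: __builtin_memset_inline(dst, value, 1); break;
    case 2: __builtin_memset_inline(dst, value, 2); break;
    case 4: __builtin_memset_inline(dst, value, 4); break;
    case 8: __builtin_memset_inline(dst, value, 8); break;
    default: break; // larger sizes would be handled elsewhere
    }
    return dst;
  }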

This patch could be split in two:
 - one for the LLVM part adding the `llvm.memset.inline.*` intrinsics.
 - and another one for the Clang part providing the intrinsic as a builtin.

Differential Revision: https://reviews.llvm.org/D126903

Added: 
    clang/test/CodeGen/builtins-memset-inline.c
    clang/test/Sema/builtins-memset-inline.cpp
    llvm/test/CodeGen/AArch64/memset-inline.ll
    llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
    llvm/test/CodeGen/X86/memset-inline.ll
    llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
    llvm/test/Verifier/memset-inline.ll

Modified: 
    clang/docs/LanguageExtensions.rst
    clang/include/clang/Basic/Builtins.def
    clang/lib/CodeGen/CGBuilder.h
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Sema/SemaChecking.cpp
    llvm/docs/LangRef.rst
    llvm/include/llvm/CodeGen/SelectionDAG.h
    llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/include/llvm/IR/IRBuilder.h
    llvm/include/llvm/IR/IntrinsicInst.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/Analysis/Lint.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/IR/IRBuilder.cpp
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
    llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
    llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
    llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
    llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
    llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
    llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
    llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
    llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
    llvm/lib/Target/X86/X86SelectionDAGInfo.h
    llvm/test/Other/lint.ll
    llvm/test/Verifier/intrinsic-immarg.ll

Removed: 
    


################################################################################
diff  --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 44848a20dca3..3e4108bdbe50 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3218,6 +3218,26 @@ Note that the `size` argument must be a compile time constant.
 
 Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
+Guaranteed inlined memset
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  void __builtin_memset_inline(void *dst, int value, size_t size);
+
+
+``__builtin_memset_inline`` has been designed as a building block for efficient
+``memset`` implementations. It is identical to ``__builtin_memset`` but also
+guarantees not to call any external functions. See LLVM IR `llvm.memset.inline
+<https://llvm.org/docs/LangRef.html#llvm-memset-inline-intrinsic>`_ intrinsic
+for more information.
+
+This is useful to implement a custom version of ``memset``, to provide a
+``libc`` ``memset``, or to work around the absence of a ``libc``.
+
+Note that the `size` argument must be a compile time constant.
+
+Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
 Atomic Min/Max builtins with memory ordering
 --------------------------------------------

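One behavioral difference from ``__builtin_memset`` is worth calling out, since the documentation above does not spell it out: per the prototype, ``__builtin_memset_inline`` returns ``void`` rather than the destination pointer. A minimal contrast (snippet invented for illustration):

  void contrast(void) {
    char buf[8];
    char *p = (char *)__builtin_memset(buf, 0, sizeof buf); // returns dst
    __builtin_memset_inline(buf, 0, sizeof buf);            // no usable result
    (void)p;
  }
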
diff  --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 173431ce3988..c084cc2c4cbd 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -559,6 +559,7 @@ BUILTIN(__builtin_memcpy_inline, "vv*vC*Iz", "n")
 BUILTIN(__builtin_memmove, "v*v*vC*z", "nF")
 BUILTIN(__builtin_mempcpy, "v*v*vC*z", "nF")
 BUILTIN(__builtin_memset, "v*v*iz", "nF")
+BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
 BUILTIN(__builtin_printf, "icC*.", "Fp:0:")
 BUILTIN(__builtin_stpcpy, "c*c*cC*", "nF")
 BUILTIN(__builtin_stpncpy, "c*c*cC*z", "nF")

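For readers unfamiliar with the Builtins.def encoding, the new entry decodes roughly as follows (per the legend at the top of Builtins.def); note the absence of the ``F`` flag that ``__builtin_memset`` carries, since there is no corresponding library function here:

  // BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
  //   v   - returns void
  //   v*  - void *dst
  //   i   - int value
  //   Iz  - size_t size, required to be an integer constant expression
  //   n   - the builtin never throws
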
diff  --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index 68618df60155..2fcfea64ede6 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -344,6 +344,14 @@ class CGBuilderTy : public CGBuilderBaseTy {
                         Dest.getAlignment().getAsAlign(), IsVolatile);
   }
 
+  using CGBuilderBaseTy::CreateMemSetInline;
+  llvm::CallInst *CreateMemSetInline(Address Dest, llvm::Value *Value,
+                                     uint64_t Size) {
+    return CreateMemSetInline(Dest.getPointer(),
+                              Dest.getAlignment().getAsAlign(), Value,
+                              getInt64(Size));
+  }
+
   using CGBuilderBaseTy::CreatePreserveStructAccessIndex;
   Address CreatePreserveStructAccessIndex(Address Addr, unsigned Index,
                                           unsigned FieldIndex,

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9d2e6dfb320d..c67df4d8137a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3508,6 +3508,17 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
     return RValue::get(Dest.getPointer());
   }
+  case Builtin::BI__builtin_memset_inline: {
+    Address Dest = EmitPointerWithAlignment(E->getArg(0));
+    Value *ByteVal =
+        Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
+    uint64_t Size =
+        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
+    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
+                        E->getArg(0)->getExprLoc(), FD, 0);
+    Builder.CreateMemSetInline(Dest, ByteVal, Size);
+    return RValue::get(nullptr);
+  }
   case Builtin::BI__builtin___memset_chk: {
     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
     Expr::EvalResult SizeResult, DstSizeResult;

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 657238eabd9d..24bcb0d47d24 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2277,6 +2277,17 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     }
     break;
   }
+  case Builtin::BI__builtin_memset_inline: {
+    clang::Expr *SizeOp = TheCall->getArg(2);
+    // We warn about filling a `nullptr` destination when `size` is greater
+    // than 0. When `size` is value dependent we cannot evaluate its value, so
+    // we bail out.
+    if (SizeOp->isValueDependent())
+      break;
+    if (!SizeOp->EvaluateKnownConstInt(Context).isZero())
+      CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc());
+    break;
+  }
 #define BUILTIN(ID, TYPE, ATTRS)
 #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \
   case Builtin::BI##ID: \

diff  --git a/clang/test/CodeGen/builtins-memset-inline.c b/clang/test/CodeGen/builtins-memset-inline.c
new file mode 100644
index 000000000000..0647186992fd
--- /dev/null
+++ b/clang/test/CodeGen/builtins-memset-inline.c
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_0(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_0(void *dst, char value) {
+  // CHECK:    call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 0, i1 false)
+  __builtin_memset_inline(dst, value, 0);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_1(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_1(void *dst, char value) {
+  // CHECK:    call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 1, i1 false)
+  __builtin_memset_inline(dst, value, 1);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_4(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_4(void *dst, char value) {
+  // CHECK:    call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 4, i1 false)
+  __builtin_memset_inline(dst, value, 4);
+}

diff  --git a/clang/test/Sema/builtins-memset-inline.cpp b/clang/test/Sema/builtins-memset-inline.cpp
new file mode 100644
index 000000000000..e445b3bcb3c2
--- /dev/null
+++ b/clang/test/Sema/builtins-memset-inline.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define NULL ((char *)0)
+
+#if __has_builtin(__builtin_memset_inline)
+#warning defined as expected
+// expected-warning@-1 {{defined as expected}}
+#endif
+
+void test_memset_inline_invalid_arg_types() {
+  __builtin_memset_inline(1, 2, 3); // expected-error {{cannot initialize a parameter of type 'void *' with an rvalue of type 'int'}}
+}
+
+void test_memset_inline_null_dst(void *ptr) {
+  __builtin_memset_inline(NULL, 1, 4); // expected-warning {{null passed to a callee that requires a non-null argument}}
+}
+
+void test_memset_inline_null_buffer_is_ok_if_size_is_zero(void *ptr, char value) {
+  __builtin_memset_inline(NULL, value, /*size*/ 0);
+}
+
+void test_memset_inline_non_constant_size(void *dst, char value, unsigned size) {
+  __builtin_memset_inline(dst, value, size); // expected-error {{argument to '__builtin_memset_inline' must be a constant integer}}
+}
+
+template <unsigned size>
+void test_memset_inline_template(void *dst, char value) {
+  // We do not try to evaluate size in non-instantiated templates.
+  __builtin_memset_inline(dst, value, size);
+}
+
+void test_memset_inline_implicit_conversion(void *ptr, char value) {
+  char a[5];
+  __builtin_memset_inline(a, value, 5);
+}
+
+void test_memset_inline_num_args(void *dst, char value) {
+  __builtin_memset_inline();                    // expected-error {{too few arguments to function call}}
+  __builtin_memset_inline(dst, value, 4, NULL); // expected-error {{too many arguments to function call}}
+}

diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 7b5b4fc2770a..d50aac1d7e61 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -13867,6 +13867,71 @@ If ``<len>`` is not a well-defined value, the behavior is undefined.
 If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
 behavior is undefined.
 
+.. _int_memset_inline:
+
+'``llvm.memset.inline``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset.inline`` on any
+integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memset.inline.p0i8.i32(i8* <dest>, i8 <val>,
+                                                i32 <len>,
+                                                i1 <isvolatile>)
+      declare void @llvm.memset.inline.p0i8.i64(i8* <dest>, i8 <val>,
+                                                i64 <len>,
+                                                i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill a block of memory with a
+particular byte value and guarantee that no external functions are called.
+
+Note that, unlike the standard libc function, the ``llvm.memset.inline.*``
+intrinsics do not return a value, take an extra isvolatile argument, and
+the dest pointer can be in a specified address space.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the byte value with which to fill it, the third argument is a constant
+integer argument specifying the number of bytes to fill, and the fourth
+is a boolean indicating a volatile access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the ``llvm.memset.inline`` call is
+a :ref:`volatile operation <volatile>`. The detailed access behavior is not
+very cleanly specified and it is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill "len" bytes of memory starting
+at the destination location. If the argument is known to be
+aligned to some boundary, this can be specified as an attribute on
+the argument.
+
+``len`` must be a constant expression.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
+The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
+'``llvm.memset.*``', but the generated code is guaranteed not to call any
+external functions.
+
 '``llvm.sqrt.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

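Pulling the pieces together, a minimal self-contained IR example, mirroring the tests added below (the 32-byte size and 16-byte alignment are arbitrary):

  declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1)

  define void @zero32(i8* %p) {
    ; The length operand must be an immediate constant; the lowering is
    ; guaranteed not to turn this into a call to memset.
    call void @llvm.memset.inline.p0i8.i64(i8* align 16 %p, i8 0, i64 32, i1 false)
    ret void
  }
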
diff  --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 8722e89624f9..b64772558fa9 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1052,7 +1052,8 @@ class SelectionDAG {
                      const AAMDNodes &AAInfo = AAMDNodes());
 
   SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
-                    SDValue Size, Align Alignment, bool isVol, bool isTailCall,
+                    SDValue Size, Align Alignment, bool isVol,
+                    bool AlwaysInline, bool isTailCall,
                     MachinePointerInfo DstPtrInfo,
                     const AAMDNodes &AAInfo = AAMDNodes());
 

diff  --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
index 722c3275fd06..e7d608969124 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -76,11 +76,13 @@ class SelectionDAGTargetInfo {
   /// that don't fit the target's parameters for simple stores and can be more
   /// efficient than using a library call. This function can return a null
   /// SDValue if the target declines to use custom code and a different
-  /// lowering strategy should be used.
+  /// lowering strategy should be used. Note that if AlwaysInline is true the
+  /// function has to return a valid SDValue.
   virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                           SDValue Chain, SDValue Op1,
                                           SDValue Op2, SDValue Op3,
                                           Align Alignment, bool isVolatile,
+                                          bool AlwaysInline,
                                           MachinePointerInfo DstPtrInfo) const {
     return SDValue();
   }

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8ad6c4d58817..4cc7fe9967ce 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3539,6 +3539,7 @@ class TargetLowering : public TargetLoweringBase {
 
   /// Determines the optimal series of memory ops to replace the memset / memcpy.
   /// Return true if the number of memory ops is below the threshold (Limit).
+  /// Note that this is always the case when Limit is ~0.
   /// It returns the types of the sequence of memory ops to perform
   /// memset / memcpy by reference.
   virtual bool

diff  --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 659960f2e09e..2e0c6a40bfdc 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -578,6 +578,12 @@ class IRBuilderBase {
                          MDNode *ScopeTag = nullptr,
                          MDNode *NoAliasTag = nullptr);
 
+  CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val,
+                               Value *Size, bool IsVolatile = false,
+                               MDNode *TBAATag = nullptr,
+                               MDNode *ScopeTag = nullptr,
+                               MDNode *NoAliasTag = nullptr);
+
   /// Create and insert an element unordered-atomic memset of the region of
   /// memory starting at the given pointer to the given value.
   ///

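For code that wants to emit the new intrinsic directly, usage mirrors the existing ``CreateMemSet``. A minimal sketch, assuming LLVM headers are available; the ``emitZero32`` helper and its parameters are invented for this example:

  #include "llvm/IR/Function.h"
  #include "llvm/IR/IRBuilder.h"

  // Assumes F's first argument is an i8* destination.
  static void emitZero32(llvm::Function &F) {
    llvm::IRBuilder<> B(&F.getEntryBlock(), F.getEntryBlock().begin());
    // Size must be a ConstantInt; the ImmArg on the intrinsic (and the
    // Verifier) enforce this.
    B.CreateMemSetInline(F.getArg(0), llvm::MaybeAlign(16), B.getInt8(0),
                         B.getInt64(32));
  }
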
diff  --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index e494fca8fd06..06d2335821d3 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -973,6 +973,7 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -984,12 +985,33 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> {
   }
 };
 
-/// This class wraps the llvm.memset intrinsic.
+/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const IntrinsicInst *I) {
-    return I->getIntrinsicID() == Intrinsic::memset;
+    switch (I->getIntrinsicID()) {
+    case Intrinsic::memset:
+    case Intrinsic::memset_inline:
+      return true;
+    default:
+      return false;
+    }
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This class wraps the llvm.memset.inline intrinsic.
+class MemSetInlineInst : public MemSetInst {
+public:
+  ConstantInt *getLength() const {
+    return cast<ConstantInt>(MemSetInst::getLength());
+  }
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_inline;
   }
   static bool classof(const Value *V) {
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@@ -1074,6 +1096,7 @@ class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {
     case Intrinsic::memcpy_inline:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memcpy_element_unordered_atomic:
     case Intrinsic::memmove_element_unordered_atomic:
     case Intrinsic::memset_element_unordered_atomic:
@@ -1095,6 +1118,7 @@ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> {
   static bool classof(const IntrinsicInst *I) {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:

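With these ``classof`` overloads, analyses that already pattern-match ``MemSetInst`` (as in the Lint change below) pick up the new intrinsic for free, while code that cares about the distinction can test for the subclass. A sketch:

  if (auto *MSI = llvm::dyn_cast<llvm::MemSetInst>(&I)) {
    // Matches both llvm.memset and llvm.memset.inline.
    if (auto *MSII = llvm::dyn_cast<llvm::MemSetInlineInst>(MSI)) {
      // For the inline variant the length is guaranteed to be a
      // ConstantInt, so no dyn_cast is needed on getLength().
      uint64_t Len = MSII->getLength()->getZExtValue();
      (void)Len;
    }
  }
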
diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c40f0d4ca412..1b1cfd428cbc 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -651,6 +651,17 @@ def int_memset  : Intrinsic<[],
                              NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
                              ImmArg<ArgIndex<3>>]>;
 
+// Memset version that is guaranteed to be inlined.
+// In particular this means that the generated code is not allowed to call any
+// external function.
+// The third argument (specifying the size) must be a constant.
+def int_memset_inline
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>, 
+       ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
 

diff  --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 291fea898970..1f46a21576e5 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -335,6 +335,12 @@ void Lint::visitCallBase(CallBase &I) {
                            MSI->getDestAlign(), nullptr, MemRef::Write);
       break;
     }
+    case Intrinsic::memset_inline: {
+      MemSetInlineInst *MSII = cast<MemSetInlineInst>(&I);
+      visitMemoryReference(I, MemoryLocation::getForDest(MSII),
+                           MSII->getDestAlign(), nullptr, MemRef::Write);
+      break;
+    }
 
     case Intrinsic::vastart:
       Check(I.getParent()->getParent()->isVarArg(),

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index efb86e154d68..314aa7cc69ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6987,17 +6987,18 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
 /// \param Size Number of bytes to write.
 /// \param Alignment Alignment of the destination in bytes.
 /// \param isVol True if destination is volatile.
+/// \param AlwaysInline Makes sure no function call is generated.
 /// \param DstPtrInfo IR information on the memory pointer.
 /// \returns New head in the control flow, if lowering was successful, empty
 /// SDValue otherwise.
 ///
 /// The function tries to replace 'llvm.memset' intrinsic with several store
 /// operations and value calculation code. This is usually profitable for small
-/// memory size.
+/// memory size or when the semantics require inlining.
 static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
                                SDValue Chain, SDValue Dst, SDValue Src,
                                uint64_t Size, Align Alignment, bool isVol,
-                               MachinePointerInfo DstPtrInfo,
+                               bool AlwaysInline, MachinePointerInfo DstPtrInfo,
                                const AAMDNodes &AAInfo) {
   // Turn a memset of undef to nop.
   // FIXME: We need to honor volatile even if Src is undef.
@@ -7017,8 +7018,10 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
     DstAlignCanChange = true;
   bool IsZeroVal =
       isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
+  unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
+
   if (!TLI.findOptimalMemOpLowering(
-          MemOps, TLI.getMaxStoresPerMemset(OptSize),
+          MemOps, Limit,
           MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
           DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
     return SDValue();
@@ -7314,7 +7317,7 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
 
 SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                 SDValue Src, SDValue Size, Align Alignment,
-                                bool isVol, bool isTailCall,
+                                bool isVol, bool AlwaysInline, bool isTailCall,
                                 MachinePointerInfo DstPtrInfo,
                                 const AAMDNodes &AAInfo) {
   // Check to see if we should lower the memset to stores first.
@@ -7327,7 +7330,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
 
     SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
                                      ConstantSize->getZExtValue(), Alignment,
-                                     isVol, DstPtrInfo, AAInfo);
+                                     isVol, false, DstPtrInfo, AAInfo);
 
     if (Result.getNode())
       return Result;
@@ -7337,11 +7340,23 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
   // code. If the target chooses to do this, this is the next best.
   if (TSI) {
     SDValue Result = TSI->EmitTargetCodeForMemset(
-        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo);
+        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo);
     if (Result.getNode())
       return Result;
   }
 
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+                                     ConstantSize->getZExtValue(), Alignment,
+                                     isVol, true, DstPtrInfo, AAInfo);
+    assert(Result &&
+           "getMemsetStores must return a valid sequence when AlwaysInline");
+    return Result;
+  }
+
   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
 
   // Emit a library call.

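To summarize the lowering strategy that results for a memset node (condensed from the code above):

  // 1. Try generic store expansion under the target's normal
  //    getMaxStoresPerMemset limit.
  // 2. Otherwise, give the target a chance to emit custom code
  //    (e.g. rep;stos on x86, MVC/XC on SystemZ).
  // 3. Otherwise, if AlwaysInline, retry store expansion with Limit = ~0,
  //    which findOptimalMemOpLowering can no longer reject.
  // 4. Otherwise, emit a library call to memset.
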
diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index a1293d3edab4..1da07ddfc9ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5909,10 +5909,28 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     bool isVol = MSI.isVolatile();
     bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
     SDValue Root = isVol ? getRoot() : getMemoryRoot();
-    SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
+    SDValue MS = DAG.getMemset(
+        Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false,
+        isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata());
+    updateDAGForMaybeTailCall(MS);
+    return;
+  }
+  case Intrinsic::memset_inline: {
+    const auto &MSII = cast<MemSetInlineInst>(I);
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Value = getValue(I.getArgOperand(1));
+    SDValue Size = getValue(I.getArgOperand(2));
+    assert(isa<ConstantSDNode>(Size) && "memset_inline needs constant size");
+    // @llvm.memset defines 0 and 1 to both mean no alignment.
+    Align DstAlign = MSII.getDestAlign().valueOrOne();
+    bool isVol = MSII.isVolatile();
+    bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+    SDValue Root = isVol ? getRoot() : getMemoryRoot();
+    SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol,
+                               /* AlwaysInline */ true, isTC,
                                MachinePointerInfo(I.getArgOperand(0)),
                                I.getAAMetadata());
-    updateDAGForMaybeTailCall(MS);
+    updateDAGForMaybeTailCall(MC);
     return;
   }
   case Intrinsic::memmove: {

diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5cf5aa9c2980..1bd83261523f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -196,7 +196,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
 bool TargetLowering::findOptimalMemOpLowering(
     std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
-  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+  if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
+      Op.getSrcAlign() < Op.getDstAlign())
     return false;
 
   EVT VT = getOptimalMemOpType(Op, FuncAttributes);

diff  --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 622ed402fbe4..d0c622fe2389 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -164,6 +164,35 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
   return CI;
 }
 
+CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign,
+                                            Value *Val, Value *Size,
+                                            bool IsVolatile, MDNode *TBAATag,
+                                            MDNode *ScopeTag,
+                                            MDNode *NoAliasTag) {
+  Dst = getCastedInt8PtrValue(Dst);
+  Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)};
+  Type *Tys[] = {Dst->getType(), Size->getType()};
+  Module *M = BB->getParent()->getParent();
+  Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys);
+
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+  if (DstAlign)
+    cast<MemSetInlineInst>(CI)->setDestAlignment(*DstAlign);
+
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+  return CI;
+}
+
 CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
     Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize,
     MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {

diff  --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 49522793e4ae..0df0c327a337 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4917,7 +4917,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy:
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
-  case Intrinsic::memset: {
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline: {
     const auto *MI = cast<MemIntrinsic>(&Call);
     auto IsValidAlignment = [&](unsigned Alignment) -> bool {
       return Alignment == 0 || isPowerOf2_32(Alignment);

diff  --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 07494c42c647..677797a6797b 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -91,7 +91,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
 
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();

diff  --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 47fe3bf7dcf5..73f93724d6fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -34,7 +34,7 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
   SDValue
   EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,

diff  --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 12d4ad889897..379521752261 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -296,7 +296,7 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
 
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
 
   const ARMSubtarget &Subtarget =
@@ -314,6 +314,9 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
   }
 
-  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                Alignment.value(), RTLIB::MEMSET);
+  if (!AlwaysInline)
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                  Alignment.value(), RTLIB::MEMSET);
+
+  return SDValue();
 }

diff  --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index 7aa831c09248..ffa8b5049351 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -55,6 +55,7 @@ class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool isVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 32305af68c90..9a1be95b67da 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1000,13 +1000,15 @@ bool SystemZTargetLowering::findOptimalMemOpLowering(
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
   const int MVCFastLen = 16;
 
-  // Don't expand Op into scalar loads/stores in these cases:
-  if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
-    return false;  // Small memcpy: Use MVC
-  if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
-    return false;  // Small memset (first byte with STC/MVI): Use MVC
-  if (Op.isZeroMemset())
-    return false;  // Memset zero: Use XC
+  if (Limit != ~unsigned(0)) {
+    // Don't expand Op into scalar loads/stores in these cases:
+    if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
+      return false; // Small memcpy: Use MVC
+    if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
+      return false; // Small memset (first byte with STC/MVI): Use MVC
+    if (Op.isZeroMemset())
+      return false; // Memset zero: Use XC
+  }
 
   return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
                                                   SrcAS, FuncAttributes);

diff  --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index db4b4879b33a..ce30d8ef2cba 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -88,7 +88,7 @@ static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
     SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
-    MachinePointerInfo DstPtrInfo) const {
+    bool AlwaysInline, MachinePointerInfo DstPtrInfo) const {
   EVT PtrVT = Dst.getValueType();
 
   if (IsVolatile)

diff  --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index da6725777e43..6ac5bf8c6c1a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -31,7 +31,7 @@ class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Dst, SDValue Byte,
                                   SDValue Size, Align Alignment,
-                                  bool IsVolatile,
+                                  bool IsVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   std::pair<SDValue, SDValue>

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index 16e05150c64e..74af4c8873f7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -44,7 +44,7 @@ SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemmove(
 
 SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool IsVolatile,
+    SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
   if (!ST.hasBulkMemory())

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index f4d2132fd3af..fd517b238715 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -37,6 +37,7 @@ class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool IsVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 };
 

diff  --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index e51d05fab5ab..78a286ae5b28 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
 
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   const X86Subtarget &Subtarget =
@@ -143,7 +143,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
                       DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                   DAG.getConstant(Offset, dl, AddrVT)),
                       Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-                      isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+                      isVolatile, AlwaysInline,
+                      /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
   }
 
   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.

diff  --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
index dac62973636c..19136ca4f6f5 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -29,7 +29,7 @@ class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,

diff  --git a/llvm/test/CodeGen/AArch64/memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-inline.ll
new file mode 100644
index 000000000000..66731ac0f04c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memset-inline.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=-neon | FileCheck %s --check-prefixes=ALL,GPR
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=ALL,NEON
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @memset_1(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_1:
+; ALL:       // %bb.0:
+; ALL-NEXT:    strb w1, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_2(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_2:
+; ALL:       // %bb.0:
+; ALL-NEXT:    bfi w1, w1, #8, #24
+; ALL-NEXT:    strh w1, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0)
+  ret void
+}
+
+define void @memset_4(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_4:
+; ALL:       // %bb.0:
+; ALL-NEXT:    mov w8, #16843009
+; ALL-NEXT:    and w9, w1, #0xff
+; ALL-NEXT:    mul w8, w9, w8
+; ALL-NEXT:    str w8, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0)
+  ret void
+}
+
+define void @memset_8(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_8:
+; ALL:       // %bb.0:
+; ALL-NEXT:    // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT:    mov x8, #72340172838076673
+; ALL-NEXT:    and x9, x1, #0xff
+; ALL-NEXT:    mul x8, x9, x8
+; ALL-NEXT:    str x8, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @memset_16(i8* %a, i8 %value) nounwind {
+; ALL-LABEL: memset_16:
+; ALL:       // %bb.0:
+; ALL-NEXT:    // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT:    mov x8, #72340172838076673
+; ALL-NEXT:    and x9, x1, #0xff
+; ALL-NEXT:    mul x8, x9, x8
+; ALL-NEXT:    stp x8, x8, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_32(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_32:
+; GPR:       // %bb.0:
+; GPR-NEXT:    // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT:    mov x8, #72340172838076673
+; GPR-NEXT:    and x9, x1, #0xff
+; GPR-NEXT:    mul x8, x9, x8
+; GPR-NEXT:    stp x8, x8, [x0, #16]
+; GPR-NEXT:    stp x8, x8, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: memset_32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    dup v0.16b, w1
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0)
+  ret void
+}
+
+define void @memset_64(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_64:
+; GPR:       // %bb.0:
+; GPR-NEXT:    // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT:    mov x8, #72340172838076673
+; GPR-NEXT:    and x9, x1, #0xff
+; GPR-NEXT:    mul x8, x9, x8
+; GPR-NEXT:    stp x8, x8, [x0, #48]
+; GPR-NEXT:    stp x8, x8, [x0, #32]
+; GPR-NEXT:    stp x8, x8, [x0, #16]
+; GPR-NEXT:    stp x8, x8, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: memset_64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    dup v0.16b, w1
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    stp q0, q0, [x0, #32]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind {
+; ALL-LABEL: aligned_memset_16:
+; ALL:       // %bb.0:
+; ALL-NEXT:    // kill: def $w1 killed $w1 def $x1
+; ALL-NEXT:    mov x8, #72340172838076673
+; ALL-NEXT:    and x9, x1, #0xff
+; ALL-NEXT:    mul x8, x9, x8
+; ALL-NEXT:    stp x8, x8, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind {
+; GPR-LABEL: aligned_memset_32:
+; GPR:       // %bb.0:
+; GPR-NEXT:    // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT:    mov x8, #72340172838076673
+; GPR-NEXT:    and x9, x1, #0xff
+; GPR-NEXT:    mul x8, x9, x8
+; GPR-NEXT:    stp x8, x8, [x0, #16]
+; GPR-NEXT:    stp x8, x8, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: aligned_memset_32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    dup v0.16b, w1
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0)
+  ret void
+}
+
+define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind {
+; GPR-LABEL: aligned_memset_64:
+; GPR:       // %bb.0:
+; GPR-NEXT:    // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT:    mov x8, #72340172838076673
+; GPR-NEXT:    and x9, x1, #0xff
+; GPR-NEXT:    mul x8, x9, x8
+; GPR-NEXT:    stp x8, x8, [x0, #48]
+; GPR-NEXT:    stp x8, x8, [x0, #32]
+; GPR-NEXT:    stp x8, x8, [x0, #16]
+; GPR-NEXT:    stp x8, x8, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: aligned_memset_64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    dup v0.16b, w1
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    stp q0, q0, [x0, #32]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @bzero_1(i8* %a) nounwind {
+; ALL-LABEL: bzero_1:
+; ALL:       // %bb.0:
+; ALL-NEXT:    strb wzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0)
+  ret void
+}
+
+define void @bzero_2(i8* %a) nounwind {
+; ALL-LABEL: bzero_2:
+; ALL:       // %bb.0:
+; ALL-NEXT:    strh wzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0)
+  ret void
+}
+
+define void @bzero_4(i8* %a) nounwind {
+; ALL-LABEL: bzero_4:
+; ALL:       // %bb.0:
+; ALL-NEXT:    str wzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0)
+  ret void
+}
+
+define void @bzero_8(i8* %a) nounwind {
+; ALL-LABEL: bzero_8:
+; ALL:       // %bb.0:
+; ALL-NEXT:    str xzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0)
+  ret void
+}
+
+define void @bzero_16(i8* %a) nounwind {
+; ALL-LABEL: bzero_16:
+; ALL:       // %bb.0:
+; ALL-NEXT:    stp xzr, xzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0)
+  ret void
+}
+
+define void @bzero_32(i8* %a) nounwind {
+; GPR-LABEL: bzero_32:
+; GPR:       // %bb.0:
+; GPR-NEXT:    adrp x8, .LCPI15_0
+; GPR-NEXT:    ldr q0, [x8, :lo12:.LCPI15_0]
+; GPR-NEXT:    stp q0, q0, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: bzero_32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0)
+  ret void
+}
+
+define void @bzero_64(i8* %a) nounwind {
+; GPR-LABEL: bzero_64:
+; GPR:       // %bb.0:
+; GPR-NEXT:    adrp x8, .LCPI16_0
+; GPR-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
+; GPR-NEXT:    stp q0, q0, [x0]
+; GPR-NEXT:    stp q0, q0, [x0, #32]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: bzero_64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    stp q0, q0, [x0, #32]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_bzero_16(i8* %a) nounwind {
+; ALL-LABEL: aligned_bzero_16:
+; ALL:       // %bb.0:
+; ALL-NEXT:    stp xzr, xzr, [x0]
+; ALL-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0)
+  ret void
+}
+
+define void @aligned_bzero_32(i8* %a) nounwind {
+; GPR-LABEL: aligned_bzero_32:
+; GPR:       // %bb.0:
+; GPR-NEXT:    adrp x8, .LCPI18_0
+; GPR-NEXT:    ldr q0, [x8, :lo12:.LCPI18_0]
+; GPR-NEXT:    stp q0, q0, [x0]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: aligned_bzero_32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0)
+  ret void
+}
+
+define void @aligned_bzero_64(i8* %a) nounwind {
+; GPR-LABEL: aligned_bzero_64:
+; GPR:       // %bb.0:
+; GPR-NEXT:    adrp x8, .LCPI19_0
+; GPR-NEXT:    ldr q0, [x8, :lo12:.LCPI19_0]
+; GPR-NEXT:    stp q0, q0, [x0]
+; GPR-NEXT:    stp q0, q0, [x0, #32]
+; GPR-NEXT:    ret
+;
+; NEON-LABEL: aligned_bzero_64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    stp q0, q0, [x0]
+; NEON-NEXT:    stp q0, q0, [x0, #32]
+; NEON-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
new file mode 100644
index 000000000000..47d7dd1ad489
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+define void @test1(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    mov x8, #72340172838076673
+; CHECK-NEXT:    and x9, x1, #0xff
+; CHECK-NEXT:    mul x8, x9, x8
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: regular_memset_calls_external_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w2, #1024
+; CHECK-NEXT:    b memset
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+  ret void
+}
+
+define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: inlined_set_doesnt_call_external_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.16b, w1
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    stp q0, q0, [x0, #32]
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll
new file mode 100644
index 000000000000..65cfceaf3491
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memset-inline.ll
@@ -0,0 +1,548 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-sse4.2 | FileCheck %s --check-prefixes=GPR,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-avx  | FileCheck %s --check-prefixes=GPR,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-avx512f | FileCheck %s --check-prefixes=GPR,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f      | FileCheck %s --check-prefixes=GPR,AVX512
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @memset_1(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_1:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movb %sil, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_2(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_2:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movzbl %sil, %eax
+; GPR-NEXT:    shll $8, %esi
+; GPR-NEXT:    orl %esi, %eax
+; GPR-NEXT:    movw %ax, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0)
+  ret void
+}
+
+define void @memset_4(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_4:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movzbl %sil, %eax
+; GPR-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; GPR-NEXT:    movl %eax, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0)
+  ret void
+}
+
+define void @memset_8(i8* %a, i8 %value) nounwind {
+; GPR-LABEL: memset_8:
+; GPR:       # %bb.0:
+; GPR-NEXT:    # kill: def $esi killed $esi def $rsi
+; GPR-NEXT:    movzbl %sil, %eax
+; GPR-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; GPR-NEXT:    imulq %rax, %rcx
+; GPR-NEXT:    movq %rcx, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @memset_16(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT:    movzbl %sil, %eax
+; SSE2-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT:    imulq %rax, %rcx
+; SSE2-NEXT:    movq %rcx, 8(%rdi)
+; SSE2-NEXT:    movq %rcx, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: memset_16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqu %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: memset_16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: memset_16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqu %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_32(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT:    movzbl %sil, %eax
+; SSE2-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT:    imulq %rax, %rcx
+; SSE2-NEXT:    movq %rcx, 24(%rdi)
+; SSE2-NEXT:    movq %rcx, 16(%rdi)
+; SSE2-NEXT:    movq %rcx, 8(%rdi)
+; SSE2-NEXT:    movq %rcx, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: memset_32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqu %xmm0, 16(%rdi)
+; SSE4-NEXT:    movdqu %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: memset_32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: memset_32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0)
+  ret void
+}
+
+define void @memset_64(i8* %a, i8 %value) nounwind {
+; SSE2-LABEL: memset_64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT:    movzbl %sil, %eax
+; SSE2-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE2-NEXT:    imulq %rax, %rcx
+; SSE2-NEXT:    movq %rcx, 56(%rdi)
+; SSE2-NEXT:    movq %rcx, 48(%rdi)
+; SSE2-NEXT:    movq %rcx, 40(%rdi)
+; SSE2-NEXT:    movq %rcx, 32(%rdi)
+; SSE2-NEXT:    movq %rcx, 24(%rdi)
+; SSE2-NEXT:    movq %rcx, 16(%rdi)
+; SSE2-NEXT:    movq %rcx, 8(%rdi)
+; SSE2-NEXT:    movq %rcx, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: memset_64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqu %xmm0, 48(%rdi)
+; SSE4-NEXT:    movdqu %xmm0, 32(%rdi)
+; SSE4-NEXT:    movdqu %xmm0, 16(%rdi)
+; SSE4-NEXT:    movdqu %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: memset_64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX-NEXT:    vmovups %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: memset_64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzbl %sil, %eax
+; AVX512-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512-NEXT:    vpbroadcastd %eax, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_memset_16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_memset_16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_memset_16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0)
+  ret void
+}
+
+define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_memset_32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE4-NEXT:    movdqa %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_memset_32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_memset_32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0)
+  ret void
+}
+
+define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind {
+; SSE2-LABEL: aligned_memset_64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_memset_64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pshufb %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE4-NEXT:    movdqa %xmm0, 32(%rdi)
+; SSE4-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE4-NEXT:    movdqa %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_memset_64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %esi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdi)
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_memset_64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzbl %sil, %eax
+; AVX512-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512-NEXT:    vpbroadcastd %eax, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @bzero_1(i8* %a) nounwind {
+; GPR-LABEL: bzero_1:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movb $0, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0)
+  ret void
+}
+
+define void @bzero_2(i8* %a) nounwind {
+; GPR-LABEL: bzero_2:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movw $0, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0)
+  ret void
+}
+
+define void @bzero_4(i8* %a) nounwind {
+; GPR-LABEL: bzero_4:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movl $0, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0)
+  ret void
+}
+
+define void @bzero_8(i8* %a) nounwind {
+; GPR-LABEL: bzero_8:
+; GPR:       # %bb.0:
+; GPR-NEXT:    movq $0, (%rdi)
+; GPR-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0)
+  ret void
+}
+
+define void @bzero_16(i8* %a) nounwind {
+; SSE2-LABEL: bzero_16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq $0, 8(%rdi)
+; SSE2-NEXT:    movq $0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: bzero_16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movups %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: bzero_16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: bzero_16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0)
+  ret void
+}
+
+define void @bzero_32(i8* %a) nounwind {
+; SSE2-LABEL: bzero_32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq $0, 24(%rdi)
+; SSE2-NEXT:    movq $0, 16(%rdi)
+; SSE2-NEXT:    movq $0, 8(%rdi)
+; SSE2-NEXT:    movq $0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: bzero_32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movups %xmm0, 16(%rdi)
+; SSE4-NEXT:    movups %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: bzero_32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovups %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: bzero_32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0)
+  ret void
+}
+
+define void @bzero_64(i8* %a) nounwind {
+; SSE2-LABEL: bzero_64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq $0, 56(%rdi)
+; SSE2-NEXT:    movq $0, 48(%rdi)
+; SSE2-NEXT:    movq $0, 40(%rdi)
+; SSE2-NEXT:    movq $0, 32(%rdi)
+; SSE2-NEXT:    movq $0, 24(%rdi)
+; SSE2-NEXT:    movq $0, 16(%rdi)
+; SSE2-NEXT:    movq $0, 8(%rdi)
+; SSE2-NEXT:    movq $0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: bzero_64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movups %xmm0, 48(%rdi)
+; SSE4-NEXT:    movups %xmm0, 32(%rdi)
+; SSE4-NEXT:    movups %xmm0, 16(%rdi)
+; SSE4-NEXT:    movups %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: bzero_64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX-NEXT:    vmovups %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: bzero_64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0)
+  ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+
+define void @aligned_bzero_16(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movaps %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_bzero_16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movaps %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_bzero_16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_bzero_16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0)
+  ret void
+}
+
+define void @aligned_bzero_32(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movaps %xmm0, 16(%rdi)
+; SSE2-NEXT:    movaps %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_bzero_32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movaps %xmm0, 16(%rdi)
+; SSE4-NEXT:    movaps %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_bzero_32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_bzero_32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0)
+  ret void
+}
+
+define void @aligned_bzero_64(i8* %a) nounwind {
+; SSE2-LABEL: aligned_bzero_64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movaps %xmm0, 48(%rdi)
+; SSE2-NEXT:    movaps %xmm0, 32(%rdi)
+; SSE2-NEXT:    movaps %xmm0, 16(%rdi)
+; SSE2-NEXT:    movaps %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: aligned_bzero_64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    xorps %xmm0, %xmm0
+; SSE4-NEXT:    movaps %xmm0, 48(%rdi)
+; SSE4-NEXT:    movaps %xmm0, 32(%rdi)
+; SSE4-NEXT:    movaps %xmm0, 16(%rdi)
+; SSE4-NEXT:    movaps %xmm0, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: aligned_bzero_64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdi)
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: aligned_bzero_64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0)
+  ret void
+}
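
For context, a hedged C-level sketch of how calls like the ones above
arise; the function name `zero64` is illustrative and not part of this
commit:

  void zero64(char *buf) {
    /* size must be a compile-time constant; clang lowers this directly
       to an @llvm.memset.inline call with i64 64, as in bzero_64 above */
    __builtin_memset_inline(buf, 0, 64);
  }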

diff --git a/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
new file mode 100644
index 000000000000..659b162585fb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+define void @test1(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; CHECK-NEXT:    imulq %rax, %rcx
+; CHECK-NEXT:    movq %rcx, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0)
+  ret void
+}
+
+define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: regular_memset_calls_external_function:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $1024, %edx # imm = 0x400
+; CHECK-NEXT:    jmp memset@PLT # TAILCALL
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+  ret void
+}
+
+define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind {
+; CHECK-LABEL: inlined_set_doesnt_call_external_function:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    movzbl %sil, %ecx
+; CHECK-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; CHECK-NEXT:    imulq %rcx, %rax
+; CHECK-NEXT:    movq %rax, 1016(%rdi)
+; CHECK-NEXT:    movq %rax, 1008(%rdi)
+  tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0)
+  ret void
+}
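
To make the guarantee concrete, a hedged C sketch; the function names are
illustrative and not part of this commit:

  void may_call_libc(char *p) {
    __builtin_memset(p, 1, 1024);        /* may tail-call memset@PLT */
  }

  void never_calls_libc(char *p) {
    __builtin_memset_inline(p, 1, 1024); /* guaranteed to expand inline */
  }

The first mirrors regular_memset_calls_external_function above, the second
inlined_set_doesnt_call_external_function.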

diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll
index 799d3fe7cbfb..fbbb7bf2d4a9 100644
--- a/llvm/test/Other/lint.ll
+++ b/llvm/test/Other/lint.ll
@@ -6,6 +6,8 @@ declare fastcc void @bar()
 declare void @llvm.stackrestore(i8*)
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
 declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memset.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.inline.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind
 declare void @has_sret(i8* sret(i8) %p)
 declare void @has_noaliases(i32* noalias %p, i32* %q)
 declare void @one_arg(i32)
@@ -87,6 +89,11 @@ call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* b
 ; CHECK: Unusual: noalias argument aliases another argument
 call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i1 0)
 
+; CHECK: Write to read-only memory
+call void @llvm.memset.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0)
+; CHECK: Write to read-only memory
+call void @llvm.memset.inline.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0)
+
 ; CHECK: Undefined behavior: Buffer overflow
   %wider = bitcast i8* %buf to i16*
   store i16 0, i16* %wider

diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index e1e77a9bfd20..5a58f33890f8 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -62,6 +62,23 @@ define void @memset(i8* %dest, i8 %val, i1 %is.volatile) {
   ret void
 }
 
+declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1)
+define void @memset_inline_is_volatile(i8* %dest, i8 %value, i1 %is.volatile) {
+  ; CHECK: immarg operand has non-immediate parameter
+  ; CHECK-NEXT: i1 %is.volatile
+  ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile)
+  call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile)
+  ret void
+}
+
+define void @memset_inline_variable_size(i8* %dest, i8 %value, i32 %size) {
+  ; CHECK: immarg operand has non-immediate parameter
+  ; CHECK-NEXT: i32 %size
+  ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true)
+  call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true)
+  ret void
+}
+
 
 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1)
 define void @objectsize(i8* %ptr, i1 %a, i1 %b, i1 %c) {
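
For comparison, a call form the verifier accepts, since both the size and
the volatile flag are immediate constants (a sketch, not part of this
commit):

  call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 false)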

diff --git a/llvm/test/Verifier/memset-inline.ll b/llvm/test/Verifier/memset-inline.ll
new file mode 100644
index 000000000000..cdde24659552
--- /dev/null
+++ b/llvm/test/Verifier/memset-inline.ll
@@ -0,0 +1,9 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+; CHECK: alignment is not a power of two 
+
+define void @foo(i8* %P, i8 %value) {
+  call void @llvm.memset.inline.p0i8.i32(i8* align 3 %P, i8 %value, i32 4, i1 false)
+  ret void
+}
+declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) nounwind
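
For reference, a well-formed counterpart that passes the verifier by using
a power-of-two alignment (a sketch, not part of this commit):

  define void @foo_ok(i8* %P, i8 %value) {
    call void @llvm.memset.inline.p0i8.i32(i8* align 4 %P, i8 %value, i32 4, i1 false)
    ret void
  }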


        

