[llvm] e229017 - Support -fstack-clash-protection for x86

via llvm-commits llvm-commits at lists.llvm.org
Sat Feb 8 04:32:16 PST 2020


Author: serge_sans_paille
Date: 2020-02-08T13:31:52+01:00
New Revision: e229017732bcf1911210903ee9811033d5588e0d

URL: https://github.com/llvm/llvm-project/commit/e229017732bcf1911210903ee9811033d5588e0d
DIFF: https://github.com/llvm/llvm-project/commit/e229017732bcf1911210903ee9811033d5588e0d.diff

LOG: Support -fstack-clash-protection for x86

Implement protection against the stack clash attack [0] through inline stack
probing.

Probe stack allocation every PAGE_SIZE during frame lowering or dynamic
allocation to make sure the page guard, if any, is touched when touching the
stack, in a similar manner to GCC[1].

This extends the existing `probe-stack' mechanism with a special value `inline-asm'.
Technically the former uses function call before stack allocation while this
patch provides inlined stack probes and chunk allocation.

Only implemented for x86.

[0] https://www.qualys.com/2017/06/19/stack-clash/stack-clash.txt
[1] https://gcc.gnu.org/ml/gcc-patches/2017-07/msg00556.html

This a recommit of 39f50da2a357a8f685b3540246c5d762734e035f with better option
handling and more portable testing

Differential Revision: https://reviews.llvm.org/D68720

Added: 
    clang/test/CodeGen/stack-clash-protection.c
    clang/test/Driver/stack-clash-protection.c
    llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
    llvm/test/CodeGen/X86/stack-clash-large.ll
    llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
    llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
    llvm/test/CodeGen/X86/stack-clash-medium.ll
    llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
    llvm/test/CodeGen/X86/stack-clash-small.ll
    llvm/test/CodeGen/X86/stack-clash-unknown-call.ll

Modified: 
    clang/docs/ClangCommandLineReference.rst
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/CodeGenOptions.def
    clang/include/clang/Basic/DiagnosticCommonKinds.td
    clang/include/clang/Basic/TargetInfo.h
    clang/include/clang/Driver/Options.td
    clang/lib/Basic/Targets/X86.h
    clang/lib/CodeGen/CGStmt.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Frontend/CompilerInvocation.cpp
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/Target/X86/X86CallFrameOptimization.cpp
    llvm/lib/Target/X86/X86FrameLowering.cpp
    llvm/lib/Target/X86/X86FrameLowering.h
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/lib/Target/X86/X86InstrInfo.td

Removed: 
    


################################################################################
diff  --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst
index 24c8ee2bc9ef..609e7fa66c00 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -1917,6 +1917,10 @@ Use a strong heuristic to apply stack protectors to functions
 
 Emit section containing metadata on function stack sizes
 
+.. option:: -fstack-clash-protection, -fno-stack-clash-protection
+
+Instrument stack allocation to prevent stack clash attacks (x86, non-Windows only).
+
 .. option:: -fstandalone-debug, -fno-limit-debug-info, -fno-standalone-debug
 
 Emit full debug info for all types used by the program

diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e957550a93cc..d24cd85673c6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -61,6 +61,10 @@ New Compiler Flags
 ------------------
 
 
+- -fstack-clash-protection will provide a protection against the stack clash
+  attack for x86 architecture through automatic probing of each page of
+  allocated stack.
+
 Deprecated Compiler Flags
 -------------------------
 

diff  --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 21e391912584..48c0df49e32d 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -150,6 +150,7 @@ CODEGENOPT(NoWarn            , 1, 0) ///< Set when -Wa,--no-warn is enabled.
 CODEGENOPT(EnableSegmentedStacks , 1, 0) ///< Set when -fsplit-stack is enabled.
 CODEGENOPT(NoInlineLineTables, 1, 0) ///< Whether debug info should contain
                                      ///< inline line tables.
+CODEGENOPT(StackClashProtector, 1, 0) ///< Set when -fstack-clash-protection is enabled.
 CODEGENOPT(NoImplicitFloat   , 1, 0) ///< Set when -mno-implicit-float is enabled.
 CODEGENOPT(NoInfsFPMath      , 1, 0) ///< Assume FP arguments, results not +-Inf.
 CODEGENOPT(NoSignedZeros     , 1, 0) ///< Allow ignoring the signedness of FP zero

diff  --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 20b49605eb6f..95cb81f09922 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -239,6 +239,10 @@ def note_invalid_subexpr_in_const_expr : Note<
 let CategoryName = "Inline Assembly Issue" in {
   def err_asm_invalid_type_in_input : Error<
     "invalid type %0 in asm input for constraint '%1'">;
+
+  def warn_stack_clash_protection_inline_asm : Warning<
+    "Unable to protect inline asm that clobbers stack pointer against stack clash">,
+    InGroup<DiagGroup<"stack-protector">>;
 }
 
 // Sema && Serialization

diff  --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 3a8e35524695..cb5aabdc468f 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -816,6 +816,8 @@ class TargetInfo : public virtual TransferrableTargetInfo,
   StringRef getNormalizedGCCRegisterName(StringRef Name,
                                          bool ReturnCanonical = false) const;
 
+  virtual bool isSPRegName(StringRef) const { return false; }
+
   /// Extracts a register from the passed constraint (if it is a
   /// single-register constraint) and the asm label expression related to a
   /// variable in the input or output list of an inline asm statement.

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 3206b27a81aa..5987e735c77a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1773,6 +1773,10 @@ def fno_signed_char : Flag<["-"], "fno-signed-char">, Group<f_Group>,
 def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>;
 def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>,
   HelpText<"Enable stack protectors for all functions">;
+def fstack_clash_protection : Flag<["-"], "fstack-clash-protection">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Enable stack clash protection">;
+def fnostack_clash_protection : Flag<["-"], "fnostack-clash-protection">, Group<f_Group>,
+  HelpText<"Disable stack clash protection">;
 def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group<f_Group>,
   HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. "
            "Compared to -fstack-protector, this uses a stronger heuristic "

diff  --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 5b5e284e5141..5141caa6fb5b 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -166,6 +166,10 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
 
   ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
 
+  bool isSPRegName(StringRef RegName) const override {
+    return RegName.equals("esp") || RegName.equals("rsp");
+  }
+
   bool validateCpuSupports(StringRef Name) const override;
 
   bool validateCpuIs(StringRef Name) const override;

diff  --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 138459c68dbf..bf63a1f7128b 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2247,8 +2247,14 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
 
     if (Clobber == "memory")
       ReadOnly = ReadNone = false;
-    else if (Clobber != "cc")
+    else if (Clobber != "cc") {
       Clobber = getTarget().getNormalizedGCCRegisterName(Clobber);
+      if (CGM.getCodeGenOpts().StackClashProtector &&
+          getTarget().isSPRegName(Clobber)) {
+        CGM.getDiags().Report(S.getAsmLoc(),
+                              diag::warn_stack_clash_protection_inline_asm);
+      }
+    }
 
     if (!Constraints.empty())
       Constraints += ',';

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 4e730025ac4f..235a40501afc 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1514,6 +1514,9 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
   if (CodeGenOpts.UnwindTables)
     B.addAttribute(llvm::Attribute::UWTable);
 
+  if (CodeGenOpts.StackClashProtector)
+    B.addAttribute("probe-stack", "inline-asm");
+
   if (!hasUnwindExceptions(LangOpts))
     B.addAttribute(llvm::Attribute::NoUnwind);
 

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 9e2279e90b7e..7901f8a48f5f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3002,6 +3002,21 @@ static void RenderSSPOptions(const ToolChain &TC, const ArgList &Args,
   }
 }
 
+static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args,
+                             ArgStringList &CmdArgs) {
+  const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple();
+
+  if (!EffectiveTriple.isOSLinux())
+    return;
+
+  if (!EffectiveTriple.isX86())
+    return;
+
+  if (Args.hasFlag(options::OPT_fstack_clash_protection,
+                   options::OPT_fnostack_clash_protection, false))
+    CmdArgs.push_back("-fstack-clash-protection");
+}
+
 static void RenderTrivialAutoVarInitOptions(const Driver &D,
                                             const ToolChain &TC,
                                             const ArgList &Args,
@@ -5248,6 +5263,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString("-mspeculative-load-hardening"));
 
   RenderSSPOptions(TC, Args, CmdArgs, KernelOrKext);
+  RenderSCPOptions(TC, Args, CmdArgs);
   RenderTrivialAutoVarInitOptions(D, TC, Args, CmdArgs);
 
   // Translate -mstackrealign

diff  --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 17e190627881..e57c7ef55bb5 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1238,6 +1238,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
 
   Opts.NoStackArgProbe = Args.hasArg(OPT_mno_stack_arg_probe);
 
+  Opts.StackClashProtector = Args.hasArg(OPT_fstack_clash_protection);
+
   if (Arg *A = Args.getLastArg(OPT_fobjc_dispatch_method_EQ)) {
     StringRef Name = A->getValue();
     unsigned Method = llvm::StringSwitch<unsigned>(Name)

diff  --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
new file mode 100644
index 000000000000..f970bf909cbe
--- /dev/null
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -0,0 +1,22 @@
+// Check the correct function attributes are generated
+// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
+
+// CHECK: define void @large_stack() #[[A:.*]] {
+void large_stack() {
+  volatile int stack[20000], i;
+  for (i = 0; i < sizeof(stack) / sizeof(int); ++i)
+    stack[i] = i;
+}
+
+// CHECK: define void @vla({{.*}}) #[[A:.*]] {
+void vla(int n) {
+  volatile int vla[n];
+  __builtin_memset(&vla[0], 0, 1);
+}
+
+// CHECK: define void @builtin_alloca({{.*}}) #[[A:.*]] {
+void builtin_alloca(int n) {
+  volatile void *mem = __builtin_alloca(n);
+}
+
+// CHECK: attributes #[[A]] = {{.*}} "probe-stack"="inline-asm"

diff  --git a/clang/test/Driver/stack-clash-protection.c b/clang/test/Driver/stack-clash-protection.c
new file mode 100644
index 000000000000..e48486be3ca5
--- /dev/null
+++ b/clang/test/Driver/stack-clash-protection.c
@@ -0,0 +1,33 @@
+// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386
+// RUN: %clang -target i386-unknown-linux -fnostack-clash-protection -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386
+// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -fnostack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386-NO
+// SCP-i386: "-fstack-clash-protection"
+// SCP-i386-NO-NOT: "-fstack-clash-protection"
+
+// RUN: %clang -target x86_64-scei-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-x86
+// SCP-x86: "-fstack-clash-protection"
+
+// RUN: %clang -target armv7k-apple-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-armv7
+// SCP-armv7-NOT: "-fstack-clash-protection"
+// SCP-armv7: argument unused during compilation: '-fstack-clash-protection'
+
+// RUN: %clang -target x86_64-unknown-linux -fstack-clash-protection -c %s 2>&1 | FileCheck %s -check-prefix=SCP-warn
+// SCP-warn: warning: Unable to protect inline asm that clobbers stack pointer against stack clash
+
+// RUN: %clang -target x86_64-pc-unknown-linux -fstack-clash-protection -S -emit-llvm -o- %s | FileCheck %s -check-prefix=SCP-ll-linux64
+// SCP-ll-linux64: attributes {{.*}} "probe-stack"="inline-asm"
+
+// RUN: %clang -target x86_64-pc-windows-msvc -fstack-clash-protection -S -emit-llvm -o- %s 2>&1 | FileCheck %s -check-prefix=SCP-ll-win64
+// SCP-ll-win64-NOT: attributes {{.*}} "probe-stack"="inline-asm"
+// SCP-ll-win64: argument unused during compilation: '-fstack-clash-protection'
+
+int foo(int c) {
+  int r;
+  __asm__("sub %0, %%rsp"
+          :
+          : "rm"(c)
+          : "rsp");
+  __asm__("mov %%rsp, %0"
+          : "=rm"(r)::);
+  return r;
+}

diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 520c1eb3dc84..65d74ef62c77 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -84,6 +84,10 @@ Changes to the X86 Target
 During this release ...
 
 
+* Functions with the probe-stack attribute set to "inline-asm" are now protected
+  against stack clash without the need of a third-party probing function and
+  with limited impact on performance.
+
 Changes to the AMDGPU Target
 -----------------------------
 

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e61c6460b919..fdb262c4bbee 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1727,6 +1727,10 @@ class TargetLoweringBase {
 
   /// Returns the name of the symbol used to emit stack probes or the empty
   /// string if not applicable.
+  virtual bool hasStackProbeSymbol(MachineFunction &MF) const { return false; }
+
+  virtual bool hasInlineStackProbe(MachineFunction &MF) const { return false; }
+
   virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const {
     return "";
   }

diff  --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index f8faa572dffc..467d24394dad 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -162,14 +162,13 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
   // memory for arguments.
   unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
-  bool UseStackProbe =
-      !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty();
+  bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
   unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
   for (MachineBasicBlock &BB : MF) {
     bool InsideFrameSequence = false;
     for (MachineInstr &MI : BB) {
       if (MI.getOpcode() == FrameSetupOpcode) {
-        if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe)
+        if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
           return false;
         if (InsideFrameSequence)
           return false;

diff  --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 39f73d9fb585..0be707494072 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -17,6 +17,7 @@
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -32,6 +33,12 @@
 #include "llvm/Target/TargetOptions.h"
 #include <cstdlib>
 
+#define DEBUG_TYPE "x86-fl"
+
+STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
+STATISTIC(NumFrameExtraProbe,
+          "Number of extra stack probes generated in prologue");
+
 using namespace llvm;
 
 X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
@@ -257,7 +264,27 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
 
   uint64_t Chunk = (1LL << 31) - 1;
 
-  if (Offset > Chunk) {
+  MachineFunction &MF = *MBB.getParent();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86TargetLowering &TLI = *STI.getTargetLowering();
+  const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+  // It's ok to not take into account large chunks when probing, as the
+  // allocation is split in smaller chunks anyway.
+  if (EmitInlineStackProbe && !InEpilogue) {
+
+    // stack probing may involve looping, and control flow generations is
+    // disallowed at this point. Rely to later processing through
+    // `inlineStackProbe`.
+    MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+
+    // Encode the static offset as a metadata attached to the stub.
+    LLVMContext &Context = MF.getFunction().getContext();
+    MachineInstrBuilder(MF, Stub).addMetadata(
+        MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get(
+                                  IntegerType::get(Context, 64), Offset))}));
+    return;
+  } else if (Offset > Chunk) {
     // Rather than emit a long series of instructions for large offsets,
     // load the offset into a register and do one sub/add
     unsigned Reg = 0;
@@ -381,8 +408,8 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
   } else {
     bool IsSub = Offset < 0;
     uint64_t AbsOffset = IsSub ? -Offset : Offset;
-    unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
-                         : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+    const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+                               : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
     MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
              .addReg(StackPtr)
              .addImm(AbsOffset);
@@ -528,6 +555,169 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
                                             const DebugLoc &DL,
                                             bool InProlog) const {
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
+    emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
+  else
+    emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
+}
+
+void X86FrameLowering::emitStackProbeInlineGeneric(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+  MachineInstr &CallToInline = *std::prev(MBBI);
+  assert(CallToInline.getOperand(1).isMetadata() &&
+         "no metadata attached to that probe");
+  uint64_t Offset =
+      cast<ConstantInt>(
+          cast<ConstantAsMetadata>(
+              cast<MDTuple>(CallToInline.getOperand(1).getMetadata())
+                  ->getOperand(0))
+              ->getValue())
+          ->getZExtValue();
+
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86TargetLowering &TLI = *STI.getTargetLowering();
+  assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
+         "
diff erent expansion expected for CoreCLR 64 bit");
+
+  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+  uint64_t ProbeChunk = StackProbeSize * 8;
+
+  // Synthesize a loop or unroll it, depending on the number of iterations.
+  if (Offset > ProbeChunk) {
+    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+  } else {
+    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
+  }
+}
+
+void X86FrameLowering::emitStackProbeInlineGenericBlock(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+    uint64_t Offset) const {
+
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86TargetLowering &TLI = *STI.getTargetLowering();
+  const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+  const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+  uint64_t CurrentOffset = 0;
+  // 0 Thanks to return address being saved on the stack
+  uint64_t CurrentProbeOffset = 0;
+
+  // For the first N - 1 pages, just probe. I tried to take advantage of
+  // natural probes but it implies much more logic and there was very few
+  // interesting natural probes to interleave.
+  while (CurrentOffset + StackProbeSize < Offset) {
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                           .addReg(StackPtr)
+                           .addImm(StackProbeSize)
+                           .setMIFlag(MachineInstr::FrameSetup);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+                     .setMIFlag(MachineInstr::FrameSetup),
+                 StackPtr, false, 0)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+    NumFrameExtraProbe++;
+    CurrentOffset += StackProbeSize;
+    CurrentProbeOffset += StackProbeSize;
+  }
+
+  uint64_t ChunkSize = Offset - CurrentOffset;
+  MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                         .addReg(StackPtr)
+                         .addImm(ChunkSize)
+                         .setMIFlag(MachineInstr::FrameSetup);
+  MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+}
+
+void X86FrameLowering::emitStackProbeInlineGenericLoop(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+    uint64_t Offset) const {
+
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86TargetLowering &TLI = *STI.getTargetLowering();
+  const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
+  // Synthesize a loop
+  NumFrameLoopProbe++;
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = ++MBB.getIterator();
+  MF.insert(MBBIter, testMBB);
+  MF.insert(MBBIter, tailMBB);
+
+  unsigned FinalStackPtr = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+  BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FinalStackPtr)
+      .addReg(StackPtr)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // save loop bound
+  {
+    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackPtr)
+        .addReg(FinalStackPtr)
+        .addImm(Offset / StackProbeSize * StackProbeSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // allocate a page
+  {
+    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+    BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(StackProbeSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // touch the page
+  addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
+                   .setMIFlag(MachineInstr::FrameSetup),
+               StackPtr, false, 0)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // cmp with stack pointer bound
+  BuildMI(testMBB, DL, TII.get(IsLP64 ? X86::CMP64rr : X86::CMP32rr))
+      .addReg(StackPtr)
+      .addReg(FinalStackPtr)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // jump
+  BuildMI(testMBB, DL, TII.get(X86::JCC_1))
+      .addMBB(testMBB)
+      .addImm(X86::COND_NE)
+      .setMIFlag(MachineInstr::FrameSetup);
+  testMBB->addSuccessor(testMBB);
+  testMBB->addSuccessor(tailMBB);
+
+  // allocate a block and touch it
+
+  tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
+  tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(testMBB);
+
+  if (Offset % StackProbeSize) {
+    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(Offset % StackProbeSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   assert(STI.is64Bit() && "
diff erent expansion needed for 32 bit");
   assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
   const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -821,13 +1011,13 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
   }
 }
 
-void X86FrameLowering::emitStackProbeInlineStub(
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
 
   assert(InProlog && "ChkStkStub called outside prolog!");
 
-  BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+  return BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
       .addExternalSymbol("__chkstk_stub");
 }
 
@@ -1014,7 +1204,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
-  bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
+  const bool EmitStackProbeCall =
+      STI.getTargetLowering()->hasStackProbeSymbol(MF);
   unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
 
   // Re-align the stack on 64-bit if the x86-interrupt calling convention is
@@ -1032,11 +1223,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // pointer, calls, or dynamic alloca then we do not need to adjust the
   // stack pointer (we fit in the Red Zone). We also check that we don't
   // push and pop from the stack.
-  if (has128ByteRedZone(MF) &&
-      !TRI->needsStackRealignment(MF) &&
+  if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) &&
       !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
       !MFI.adjustsStack() &&                   // No calls.
-      !UseStackProbe &&                        // No stack probes.
+      !EmitStackProbeCall &&                   // No stack probes.
       !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
       !MF.shouldSplitStack()) {                // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1237,7 +1427,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   uint64_t AlignedNumBytes = NumBytes;
   if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
     AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
-  if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+  if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
     assert(!X86FI->getUsesRedZone() &&
            "The Red Zone is not accounted for in stack probes");
 

diff  --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index a59ad175b11c..92bf934888b9 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -195,11 +195,33 @@ class X86FrameLowering : public TargetFrameLowering {
   void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
                             const DebugLoc &DL, bool InProlog) const;
+  void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF,
+                                            MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI,
+                                            const DebugLoc &DL,
+                                            bool InProlog) const;
+  void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, bool InProlog) const;
+
+  void emitStackProbeInlineGenericBlock(MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const DebugLoc &DL,
+                                        uint64_t Offset) const;
+
+  void emitStackProbeInlineGenericLoop(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       const DebugLoc &DL,
+                                       uint64_t Offset) const;
 
   /// Emit a stub to later inline the target stack probe.
-  void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MBBI,
-                                const DebugLoc &DL, bool InProlog) const;
+  MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+                                         MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MBBI,
+                                         const DebugLoc &DL,
+                                         bool InProlog) const;
 
   /// Aligns the stack pointer by ANDing it with -MaxAlign.
   void BuildStackAlignAND(MachineBasicBlock &MBB,

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 85741ca1c057..613d4f723b35 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23111,9 +23111,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                            SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
-  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
+  bool EmitStackProbeCall = hasStackProbeSymbol(MF);
   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
-               SplitStack || EmitStackProbe;
+               SplitStack || EmitStackProbeCall;
   SDLoc dl(Op);
 
   // Get the inputs.
@@ -23137,11 +23137,21 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                     " not tell us which reg is the stack pointer!");
 
-    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
-    Chain = SP.getValue(1);
     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
     const Align StackAlign(TFI.getStackAlignment());
-    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    if (hasInlineStackProbe(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+
+      const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+      Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+      Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
+                           DAG.getRegister(Vreg, SPTy));
+    } else {
+      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+      Chain = SP.getValue(1);
+      Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    }
     if (Alignment && Alignment > StackAlign)
       Result =
           DAG.getNode(ISD::AND, dl, VT, Result,
@@ -29874,6 +29884,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MEMBARRIER)
   NODE_NAME_CASE(MFENCE)
   NODE_NAME_CASE(SEG_ALLOCA)
+  NODE_NAME_CASE(PROBED_ALLOCA)
   NODE_NAME_CASE(RDRAND)
   NODE_NAME_CASE(RDSEED)
   NODE_NAME_CASE(RDPKRU)
@@ -31141,6 +31152,97 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
   return SinkMBB;
 }
 
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
+  DebugLoc DL = MI.getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+  const unsigned ProbeSize = getStackProbeSize(*MF);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = ++BB->getIterator();
+  MF->insert(MBBIter, testMBB);
+  MF->insert(MBBIter, blockMBB);
+  MF->insert(MBBIter, tailMBB);
+
+  unsigned sizeVReg = MI.getOperand(1).getReg();
+
+  const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
+
+  unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
+  unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
+
+  unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+
+  // test rsp size
+  BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
+      .addReg(sizeVReg)
+      .addMBB(BB)
+      .addReg(tmpSizeVReg2)
+      .addMBB(blockMBB);
+
+  BuildMI(testMBB, DL,
+          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
+      .addReg(tmpSizeVReg)
+      .addImm(ProbeSize);
+
+  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(tailMBB)
+      .addImm(X86::COND_L);
+  testMBB->addSuccessor(blockMBB);
+  testMBB->addSuccessor(tailMBB);
+
+  // allocate a block and touch it
+
+  BuildMI(blockMBB, DL,
+          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
+          tmpSizeVReg2)
+      .addReg(tmpSizeVReg)
+      .addImm(ProbeSize);
+
+  BuildMI(blockMBB, DL,
+          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
+          physSPReg)
+      .addReg(physSPReg)
+      .addImm(ProbeSize);
+
+  const unsigned MovMIOpc =
+      TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
+  addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
+      .addImm(0);
+
+  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+  blockMBB->addSuccessor(testMBB);
+
+  // allocate the tail and continue
+  BuildMI(tailMBB, DL,
+          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
+          physSPReg)
+      .addReg(physSPReg)
+      .addReg(tmpSizeVReg);
+  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+      .addReg(physSPReg);
+
+  tailMBB->splice(tailMBB->end(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  tailMBB->transferSuccessorsAndUpdatePHIs(BB);
+  BB->addSuccessor(testMBB);
+
+  // Delete the original pseudo instruction.
+  MI.eraseFromParent();
+
+  // And we're done.
+  return tailMBB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
@@ -32301,6 +32403,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::SEG_ALLOCA_32:
   case X86::SEG_ALLOCA_64:
     return EmitLoweredSegAlloca(MI, BB);
+  case X86::PROBED_ALLOCA_32:
+  case X86::PROBED_ALLOCA_64:
+    return EmitLoweredProbedAlloca(MI, BB);
   case X86::TLSCall_32:
   case X86::TLSCall_64:
     return EmitLoweredTLSCall(MI, BB);
@@ -47560,10 +47665,35 @@ bool X86TargetLowering::supportSwiftError() const {
   return Subtarget.is64Bit();
 }
 
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+  return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+  // No inline stack probe for Windows, they have their own mechanism.
+  if (Subtarget.isOSWindows() ||
+      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+    return false;
+
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack"))
+    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+           "inline-asm";
+
+  return false;
+}
+
 /// Returns the name of the symbol used to emit stack probes or the empty
 /// string if not applicable.
 StringRef
 X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  // Inline Stack probes disable stack probe call
+  if (hasInlineStackProbe(MF))
+    return "";
+
   // If the function specifically requests stack probes, emit them.
   if (MF.getFunction().hasFnAttribute("probe-stack"))
     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6336b3ffa260..cf88ec8ae413 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -537,6 +537,10 @@ namespace llvm {
       // falls back to heap allocation if not.
       SEG_ALLOCA,
 
+      // For allocating stack space when using stack clash protector.
+      // Allocation is performed by block, and each block is probed.
+      PROBED_ALLOCA,
+
       // Memory barriers.
       MEMBARRIER,
       MFENCE,
@@ -1224,6 +1228,8 @@ namespace llvm {
 
     bool supportSwiftError() const override;
 
+    bool hasStackProbeSymbol(MachineFunction &MF) const override;
+    bool hasInlineStackProbe(MachineFunction &MF) const override;
     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
 
     unsigned getStackProbeSize(MachineFunction &MF) const;
@@ -1448,6 +1454,9 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                             MachineBasicBlock *BB) const;
 
+    MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const;
+
     MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;
 

diff  --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 43a67cbc42dc..96ebb44fecb5 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -111,6 +111,23 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
                       [(set GR64:$dst,
                          (X86SegAlloca GR64:$size))]>,
                     Requires<[In64BitMode]>;
+
+// To protect against stack clash, dynamic allocation should perform a memory
+// probe at each page.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca with probing",
+                      [(set GR32:$dst,
+                         (X86ProbedAlloca GR32:$size))]>,
+                    Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca with probing",
+                      [(set GR64:$dst,
+                         (X86ProbedAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
 }
 
 // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 131832b903ed..a0c2b6f47f13 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -121,6 +121,8 @@ def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
 
 def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
 
+def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
 def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -292,6 +294,9 @@ def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
 def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
                           [SDNPHasChain]>;
 
+def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA,
+                          [SDNPHasChain]>;
+
 def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 

diff  --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
new file mode 100644
index 000000000000..c0a199e16a94
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i32 %n) local_unnamed_addr #0 {
+
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:  	pushq	%rbp
+; CHECK-NEXT:  	.cfi_def_cfa_offset 16
+; CHECK-NEXT:  	.cfi_offset %rbp, -16
+; CHECK-NEXT:  	movq	%rsp, %rbp
+; CHECK-NEXT:  	.cfi_def_cfa_register %rbp
+; CHECK-NEXT:  	movl	%edi, %eax
+; CHECK-NEXT:  	leaq	15(,%rax,4), %rax
+; CHECK-NEXT:  	andq	$-16, %rax
+; CHECK-NEXT:  	cmpq	$4096, %rax # imm = 0x1000
+; CHECK-NEXT:  	jl	.LBB0_3
+; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:  	subq	$4096, %rax # imm = 0x1000
+; CHECK-NEXT:  	subq	$4096, %rsp # imm = 0x1000
+; CHECK-NEXT:  	movq	$0, (%rsp)
+; CHECK-NEXT:  	cmpq	$4096, %rax # imm = 0x1000
+; CHECK-NEXT:  	jge	.LBB0_2
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:  	subq	%rax, %rsp
+; CHECK-NEXT:  	movq	%rsp, %rax
+; CHECK-NEXT:  	movl	$1, 4792(%rax)
+; CHECK-NEXT:  	movl	(%rax), %eax
+; CHECK-NEXT:  	movq	%rbp, %rsp
+; CHECK-NEXT:  	popq	%rbp
+; CHECK-NEXT:  .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:   retq
+
+  %a = alloca i32, i32 %n, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 1198
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
new file mode 100644
index 000000000000..f9a5fdc17b84
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() local_unnamed_addr #0 {
+
+; CHECK-LABEL: foo:
+; CHECK:        # %bb.0:
+; CHECK-NEXT:	movq	%rsp, %r11
+; CHECK-NEXT:	subq	$69632, %r11 # imm = 0x11000
+; CHECK-NEXT:   .LBB0_1:
+; CHECK-NEXT:	subq	$4096, %rsp # imm = 0x1000
+; CHECK-NEXT:	movq	$0, (%rsp)
+; CHECK-NEXT:	cmpq	%r11, %rsp
+; CHECK-NEXT:	jne	.LBB0_1
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT:	subq	$2248, %rsp # imm = 0x8C8
+; CHECK-NEXT:	.cfi_def_cfa_offset 71888
+; CHECK-NEXT:	movl	$1, 264(%rsp)
+; CHECK-NEXT:	movl	$1, 28664(%rsp)
+; CHECK-NEXT:	movl	-128(%rsp), %eax
+; CHECK-NEXT:	addq	$71880, %rsp # imm = 0x118C8
+; CHECK-NEXT:	.cfi_def_cfa_offset 8
+; CHECK-NEXT:	retq
+
+
+  %a = alloca i32, i64 18000, align 16
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 7198
+  store volatile i32 1, i32* %b0
+  store volatile i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
new file mode 100644
index 000000000000..1a595f639f26
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() local_unnamed_addr #0 {
+
+; CHECK-LABEL: foo:
+; CHECK:         # %bb.0:
+; CHECK-NEXT	subq	$4096, %rsp # imm = 0x1000
+; CHECK-NEXT	.cfi_def_cfa_offset 5888
+; CHECK-NEXT	movl	$1, 2088(%rsp)
+; CHECK-NEXT	subq	$1784, %rsp # imm = 0x6F8
+; CHECK-NEXT	movl	$2, 672(%rsp)
+; CHECK-NEXT	movl	1872(%rsp), %eax
+; CHECK-NEXT	addq	$5880, %rsp # imm = 0x16F8
+; CHECK-NEXT	.cfi_def_cfa_offset 8
+; CHECK-NEXT	retq
+
+
+  %a = alloca i32, i64 1000, align 16
+  %b = alloca i32, i64 500, align 16
+  %a0 = getelementptr inbounds i32, i32* %a, i64 500
+  %b0 = getelementptr inbounds i32, i32* %b, i64 200
+  store volatile i32 1, i32* %a0
+  store volatile i32 2, i32* %b0
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
new file mode 100644
index 000000000000..bb2be8846ec2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() local_unnamed_addr #0 {
+
+; CHECK-LABEL: foo:
+; CHECK:         # %bb.0:
+; CHECK-NEXT:	subq	$4096, %rsp             # imm = 0x1000
+; CHECK-NEXT:	movq	$0, (%rsp)
+; CHECK-NEXT:	subq	$3784, %rsp             # imm = 0xEC8
+; CHECK-NEXT:	.cfi_def_cfa_offset 7888
+; CHECK-NEXT:	movl	$1, 264(%rsp)
+; CHECK-NEXT:	movl	$1, 4664(%rsp)
+; CHECK-NEXT:	movl	-128(%rsp), %eax
+; CHECK-NEXT:	addq	$7880, %rsp             # imm = 0x1EC8
+; CHECK-NEXT:	.cfi_def_cfa_offset 8
+; CHECK-NEXT:	retq
+
+
+
+  %a = alloca i32, i64 2000, align 16
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 1198
+  store i32 1, i32* %b0
+  store i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll
new file mode 100644
index 000000000000..05af3478cfc0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() local_unnamed_addr #0 {
+
+; CHECK-LABEL: foo:
+; CHECK:      # %bb.0:
+; CHECK-NEXT: subq	$4096, %rsp             # imm = 0x1000
+; CHECK-NEXT: movq	$0, (%rsp)
+; CHECK-NEXT: subq	$3784, %rsp             # imm = 0xEC8
+; CHECK-NEXT: .cfi_def_cfa_offset 7888
+; CHECK-NEXT: movl	$1, 672(%rsp)
+; CHECK-NEXT: movl	-128(%rsp), %eax
+; CHECK-NEXT: addq	$7880, %rsp             # imm = 0x1EC8
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+
+
+
+  %a = alloca i32, i64 2000, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 200
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
new file mode 100644
index 000000000000..652acbdf00ba
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i64 %i) local_unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:  subq	$4096, %rsp # imm = 0x1000
+; CHECK-NEXT:  movq	$0, (%rsp)
+; CHECK-NEXT:  subq	$3784, %rsp # imm = 0xEC8
+; CHECK-NEXT:  .cfi_def_cfa_offset 7888
+; CHECK-NEXT:  movl	$1, -128(%rsp,%rdi,4)
+; CHECK-NEXT:  movl	-128(%rsp), %eax
+; CHECK-NEXT:  addq	$7880, %rsp # imm = 0x1EC8
+; CHECK-NEXT:  .cfi_def_cfa_offset 8
+; CHECK-NEXT:  retq
+
+  %a = alloca i32, i32 2000, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 %i
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
+

diff  --git a/llvm/test/CodeGen/X86/stack-clash-small.ll b/llvm/test/CodeGen/X86/stack-clash-small.ll
new file mode 100644
index 000000000000..bf40fe907dc2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-small.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:  subq	$280, %rsp # imm = 0x118
+; CHECK-NEXT:  .cfi_def_cfa_offset 288
+; CHECK-NEXT:  movl	$1, 264(%rsp)
+; CHECK-NEXT:  movl	-128(%rsp), %eax
+; CHECK-NEXT:  addq	$280, %rsp # imm = 0x118
+; CHECK-NEXT:  .cfi_def_cfa_offset 8
+; CHECK-NEXT:  retq
+
+  %a = alloca i32, i64 100, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 98
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}

diff  --git a/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll
new file mode 100644
index 000000000000..9294d70528fa
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg);
+
+define void @foo() local_unnamed_addr #0 {
+
+;CHECK-LABEL: foo:
+;CHECK:         # %bb.0:
+;CHECK-NEXT:	subq	$4096, %rsp # imm = 0x1000
+; it's important that we don't use the call as a probe here
+;CHECK-NEXT:	movq	$0, (%rsp)
+;CHECK-NEXT:	subq	$3912, %rsp # imm = 0xF48
+;CHECK-NEXT:	.cfi_def_cfa_offset 8016
+;CHECK-NEXT:	movq	%rsp, %rdi
+;CHECK-NEXT:	movl	$8000, %edx # imm = 0x1F40
+;CHECK-NEXT:	xorl	%esi, %esi
+;CHECK-NEXT:	callq	memset
+;CHECK-NEXT:	addq	$8008, %rsp # imm = 0x1F48
+;CHECK-NEXT:	.cfi_def_cfa_offset 8
+;CHECK-NEXT:	retq
+
+  %a = alloca i8, i64 8000, align 16
+  call void @llvm.memset.p0i8.i64(i8* align 16 %a, i8 0, i64 8000, i1 false)
+  ret void
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}


        


More information about the llvm-commits mailing list