[llvm] [llvm][aarch64][x86] Implement a lightweight spectre v1 mitigation, like MSVC /Qspectre (PR #116450)

Daniel Paoliello via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 15 15:26:54 PST 2024


https://github.com/dpaoliello created https://github.com/llvm/llvm-project/pull/116450

Implements a form of load hardening as a mitigation against Spectre v1.

Unlike the other [LLVM mitigations](https://llvm.org/docs/SpeculativeLoadHardening.html), this mitigation is more like [MSVC's `/Qspectre` flag](https://learn.microsoft.com/en-us/cpp/build/reference/qspectre?view=msvc-170): it provides less comprehensive coverage, but it is also cheap enough to be applied widely.

Specifically, this mitigation tries to identify the pattern outlined in <https://devblogs.microsoft.com/cppblog/spectre-mitigations-in-msvc>, that is, an offsetted load that is used to offset another load, both of which are guarded by a bounds check. For example:
```cpp
if (untrusted_index < array1_length) {
    unsigned char value = array1[untrusted_index];
    unsigned char value2 = array2[value * 64];
}
```
The other case this mitigation looks for is an indirect call whose target comes from an offsetted load that is protected by a bounds check. For example:
```cpp
if (index < funcs_len) {
  return funcs[index * 4]();
}
```

This mitigation inserts a new `speculative_data_barrier` intrinsic into the block containing the second load or the indirect call. The intrinsic is lowered to `LFENCE` on x86 and `CSDB` on AArch64.
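
To make the effect concrete, this is roughly what the `if.then` block of the first example looks like after the pass runs (taken from the `guarded-load-hardening.ll` test added below; the pass is currently off by default and gated behind the hidden `-guarded-load-hardening` flag, e.g. `opt -passes=guarded-load-hardening -guarded-load-hardening`):
```llvm
if.then:
  ; Barrier inserted at the top of the guarded block; lowered to LFENCE on
  ; x86 and CSDB on AArch64.
  call void @llvm.speculative.data.barrier()
  %idxprom = sext i32 %index to i64
  %arrayidx = getelementptr inbounds i8, ptr %indexes, i64 %idxprom
  %0 = load i8, ptr %arrayidx, align 1          ; first, bounds-checked load
  %conv = zext i8 %0 to i64
  %mul = shl nuw nsw i64 %conv, 6
  %arrayidx2 = getelementptr inbounds i8, ptr %data, i64 %mul
  %1 = load i8, ptr %arrayidx2, align 1         ; second, dependent load
  %conv3 = zext i8 %1 to i32
  br label %return
```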

From d5b36f2726d6507d57e82bd0a9a5fbb83de64b2b Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Wed, 13 Nov 2024 13:11:59 -0800
Subject: [PATCH] [llvm][aarch64][x86] Implement a lightweight spectre v1
 mitigation, like MSVC /Qspectre

---
 llvm/include/llvm/IR/Intrinsics.td            |   3 +
 llvm/include/llvm/InitializePasses.h          |   1 +
 .../Transforms/Utils/GuardedLoadHardening.h   |  31 ++
 llvm/lib/CodeGen/IntrinsicLowering.cpp        |   4 +
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   3 +
 .../Target/AArch64/AArch64TargetMachine.cpp   |   4 +
 llvm/lib/Target/X86/X86InstrCompiler.td       |   3 +
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   4 +
 llvm/lib/Transforms/Utils/CMakeLists.txt      |   1 +
 .../Transforms/Utils/GuardedLoadHardening.cpp | 288 ++++++++++++++++++
 .../AArch64/speculative-data-barrier.ll       |  15 +
 .../CodeGen/X86/speculative-data-barrier.ll   |  15 +
 .../Transforms/Util/guarded-load-hardening.ll | 245 +++++++++++++++
 15 files changed, 619 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Utils/GuardedLoadHardening.h
 create mode 100644 llvm/lib/Transforms/Utils/GuardedLoadHardening.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/speculative-data-barrier.ll
 create mode 100644 llvm/test/CodeGen/X86/speculative-data-barrier.ll
 create mode 100644 llvm/test/Transforms/Util/guarded-load-hardening.ll

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1ca8c2565ab0b6..9074bb18903a25 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -885,6 +885,9 @@ def int_readcyclecounter : DefaultAttrsIntrinsic<[llvm_i64_ty]>;
 
 def int_readsteadycounter : DefaultAttrsIntrinsic<[llvm_i64_ty]>;
 
+def int_speculative_data_barrier  : DefaultAttrsIntrinsic<[], [],
+                                            [IntrHasSideEffects]>;
+
 // The assume intrinsic is marked InaccessibleMemOnly so that proper control
 // dependencies will be maintained.
 def int_assume : DefaultAttrsIntrinsic<
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 7ecd59a14f709a..35976931d566b6 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -126,6 +126,7 @@ void initializeGVNLegacyPassPass(PassRegistry &);
 void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
 void initializeGlobalMergePass(PassRegistry &);
 void initializeGlobalsAAWrapperPassPass(PassRegistry &);
+void initializeGuardedLoadHardeningPass(PassRegistry &);
 void initializeHardwareLoopsLegacyPass(PassRegistry &);
 void initializeMIRProfileLoaderPassPass(PassRegistry &);
 void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Utils/GuardedLoadHardening.h b/llvm/include/llvm/Transforms/Utils/GuardedLoadHardening.h
new file mode 100644
index 00000000000000..2e07181bfffb56
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/GuardedLoadHardening.h
@@ -0,0 +1,31 @@
+//=== GuardedLoadHardening.h - Lightweight spectre v1 mitigation *- C++ -*===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+// Lightweight load hardening as a mitigation against Spectre v1.
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_GUARDEDLOADHARDENING_H
+#define LLVM_TRANSFORMS_GUARDEDLOADHARDENING_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class FunctionPass;
+
+class GuardedLoadHardeningPass
+    : public PassInfoMixin<GuardedLoadHardeningPass> {
+public:
+  GuardedLoadHardeningPass() = default;
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+FunctionPass *createGuardedLoadHardeningPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index f799a8cfc1ba7e..fd2fb5f5f0ffbd 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -324,6 +324,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     break;
   }
 
+  case Intrinsic::speculative_data_barrier:
+    break; // Simply strip out speculative_data_barrier on unsupported
+           // architectures
+
   case Intrinsic::dbg_declare:
   case Intrinsic::dbg_label:
     break;    // Simply strip out debugging intrinsics
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index a181a28f502f59..ca54f9fb92d9da 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -308,6 +308,7 @@
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/FixIrreducible.h"
+#include "llvm/Transforms/Utils/GuardedLoadHardening.h"
 #include "llvm/Transforms/Utils/HelloWorld.h"
 #include "llvm/Transforms/Utils/IRNormalizer.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 7c3798f6462a46..f451ade4a295a9 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -370,6 +370,7 @@ FUNCTION_PASS("flatten-cfg", FlattenCFGPass())
 FUNCTION_PASS("float2int", Float2IntPass())
 FUNCTION_PASS("gc-lowering", GCLoweringPass())
 FUNCTION_PASS("guard-widening", GuardWideningPass())
+FUNCTION_PASS("guarded-load-hardening", GuardedLoadHardeningPass())
 FUNCTION_PASS("gvn-hoist", GVNHoistPass())
 FUNCTION_PASS("gvn-sink", GVNSinkPass())
 FUNCTION_PASS("helloworld", HelloWorldPass())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 10e34a83a10da1..4818a638584cb7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10607,6 +10607,9 @@ let Predicates = [HasLSFE] in {
 let Uses = [FPMR, FPCR] in
 defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
 
+// Use the CSDB instruction as a barrier.
+def : Pat<(int_speculative_data_barrier), (HINT 0x14)>;
+
 include "AArch64InstrAtomics.td"
 include "AArch64SVEInstrInfo.td"
 include "AArch64SMEInstrInfo.td"
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 074f39c19fdb24..025b23993eca28 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -49,6 +49,7 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/CFGuard.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardedLoadHardening.h"
 #include "llvm/Transforms/Utils/LowerIFunc.h"
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
 #include <memory>
@@ -669,6 +670,9 @@ void AArch64PassConfig::addIRPasses() {
       addPass(createCFGuardCheckPass());
   }
 
+  // Lightweight spectre v1 mitigation.
+  addPass(createGuardedLoadHardeningPass());
+
   if (TM->Options.JMCInstrument)
     addPass(createJMCInstrumenterPass());
 }
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ea0b66c2f55162..fab982fdd68932 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2213,3 +2213,6 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
 let Predicates = [HasMOVBE] in {
  def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 }
+
+// Use the LFENCE instruction as a barrier.
+def : Pat<(int_speculative_data_barrier), (LFENCE)>;
\ No newline at end of file
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 20dfdd27b33df6..e3a85adf09409c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/CFGuard.h"
+#include "llvm/Transforms/Utils/GuardedLoadHardening.h"
 #include <memory>
 #include <optional>
 #include <string>
@@ -492,6 +493,9 @@ void X86PassConfig::addIRPasses() {
     }
   }
 
+  // Lightweight spectre v1 mitigation.
+  addPass(createGuardedLoadHardeningPass());
+
   if (TM->Options.JMCInstrument)
     addPass(createJMCInstrumenterPass());
 }
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 65bd3080662c4d..503b0cdb080d4a 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMTransformUtils
   FunctionComparator.cpp
   FunctionImportUtils.cpp
   GlobalStatus.cpp
+  GuardedLoadHardening.cpp
   GuardUtils.cpp
   HelloWorld.cpp
   InlineFunction.cpp
diff --git a/llvm/lib/Transforms/Utils/GuardedLoadHardening.cpp b/llvm/lib/Transforms/Utils/GuardedLoadHardening.cpp
new file mode 100644
index 00000000000000..c2c50108bed81a
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/GuardedLoadHardening.cpp
@@ -0,0 +1,288 @@
+//=== GuardedLoadHardening.cpp -Lightweight spectre v1 mitigation *- C++ -*===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements a form of load hardening as a mitigation against Spectre v1.
+// Unlike the other [LLVM mitigations](/llvm/docs/SpeculativeLoadHardening.md)
+// this mitigation is more like MSVC's /Qspectre flag in that it provides less
+// comprehensive coverage but is also cheap enough that it can be widely
+// applied.
+//
+// Specifically this mitigation is trying to identify the pattern outlined in
+// <https://devblogs.microsoft.com/cppblog/spectre-mitigations-in-msvc>
+// that is, an offsetted load that is used to offset another load, both of which
+// are guarded by a bounds check. For example:
+// ```cpp
+// if (untrusted_index < array1_length) {
+//     unsigned char value = array1[untrusted_index];
+//     unsigned char value2 = array2[value * 64];
+// }
+// ```
+//
+// The other case that this mitigation looks for is an indirect call from an
+// offsetted load that is protected by a bounds check. For example:
+// ```cpp
+// if (index < funcs_len) {
+//   return funcs[index * 4]();
+// }
+// ```
+//
+// This mitigation will insert the `speculative_data_barrier` intrinsic into the
+// block with the second load or the indirect call.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GuardedLoadHardening.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guarded-load-hardening"
+
+static cl::opt<bool>
+    EnableGuardedLoadHardening("guarded-load-hardening",
+                               cl::desc("Enable guarded load hardening"),
+                               cl::init(false), cl::Hidden);
+
+STATISTIC(NumIntrInserted, "Intrinsics inserted");
+STATISTIC(CandidateBlocks, "Candidate blocks discovered");
+STATISTIC(OffsettedLoads, "Offsetted loads discovered");
+STATISTIC(DownstreamInstr, "Downstream loads or calls discovered");
+STATISTIC(OffsettedLoadsRemoved, "Candidate offsetted loads removed");
+
+namespace {
+
+class GuardedLoadHardening : public FunctionPass {
+public:
+  static char ID;
+
+  // Default constructor required for the INITIALIZE_PASS macro.
+  GuardedLoadHardening() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+/// Visits the given value and all of its operands recursively, if they are of a
+/// type that is interesting to this analysis.
+bool visitDependencies(const Value &Start,
+                       const std::function<bool(const Value &)> &Visitor) {
+  SmallVector<const Value *, 4> Worklist{&Start};
+  while (!Worklist.empty()) {
+    auto *Item = Worklist.pop_back_val();
+    if (isa<Argument>(Item)) {
+      if (Visitor(*Item)) {
+        return true;
+      }
+    } else if (auto *Inst = dyn_cast<Instruction>(Item)) {
+      // Only visit the operands of unary, binary, and cast instructions. There
+      // are many other instructions that could be unwrapped here (e.g., Phi
+      // nodes, SelectInst), but they make the analysis too expensive.
+      if (Inst->isUnaryOp() || Inst->isBinaryOp() || Inst->isCast()) {
+        Worklist.append(Inst->value_op_begin(), Inst->value_op_end());
+      } else if (isa<CallInst>(Inst) || isa<LoadInst>(Inst) ||
+                 isa<AllocaInst>(Inst)) {
+        if (Visitor(*Item)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+/// Gathers the given value and all of its operands recursively, if they are of
+/// a type that is interesting to this analysis.
+void gatherDependencies(const Value &Start,
+                        std::vector<const Value *> &Dependencies) {
+  visitDependencies(Start, [&](const Value &V) {
+    Dependencies.push_back(&V);
+    return false;
+  });
+}
+
+/// Checks if the given instruction is an offsetted load and returns the indices
+/// used to offset that load.
+std::optional<iterator_range<User::const_op_iterator>>
+tryGetIndicesIfOffsettedLoad(const Value &I) {
+  if (auto *Load = dyn_cast<LoadInst>(&I)) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Load->getPointerOperand())) {
+      if (GEP->hasIndices() && !GEP->hasAllConstantIndices()) {
+        return GEP->indices();
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+/// Tries to get the comparison instruction if the given block is guarded by a
+/// relative integer comparison.
+std::optional<const ICmpInst *>
+tryGetComparisonIfGuarded(const BasicBlock &BB) {
+  if (auto *PredBB = BB.getSinglePredecessor()) {
+    if (auto *CondBranch = dyn_cast<BranchInst>(PredBB->getTerminator())) {
+      if (CondBranch->isConditional()) {
+        if (auto *Comparison = dyn_cast<ICmpInst>(CondBranch->getCondition())) {
+          if (Comparison->isRelational()) {
+            return Comparison;
+          }
+        }
+      }
+    }
+  }
+
+  return std::nullopt;
+}
+
+/// Does the given value use an offsetted load that requires protection?
+bool useRequiresProtection(const Value &MightUseIndex,
+                           const ICmpInst &Comparison,
+                           SmallVector<std::pair<const Value *, const Value *>,
+                                       4> &OffsettedLoadAndUses) {
+
+  SmallVector<const Value *, 4> OffsettedLoadIndexesToRemove;
+  for (auto &LoadAndUse : OffsettedLoadAndUses) {
+    if ((&MightUseIndex == LoadAndUse.second) &&
+        !is_contained(OffsettedLoadIndexesToRemove, LoadAndUse.first)) {
+      ++DownstreamInstr;
+
+      // If we've found a use of one of the offsetted loads, then we need to
+      // check if that offsetted load uses a value that is also used in the
+      // comparison.
+      std::vector<const Value *> ComparisonDependencies;
+      gatherDependencies(*Comparison.getOperand(0), ComparisonDependencies);
+      gatherDependencies(*Comparison.getOperand(1), ComparisonDependencies);
+
+      for (auto &Index : *tryGetIndicesIfOffsettedLoad(*LoadAndUse.first)) {
+        if (!isa<Constant>(&Index) &&
+            visitDependencies(*Index, [&](const Value &V) {
+              return is_contained(ComparisonDependencies, &V);
+            })) {
+          return true;
+        }
+      }
+
+      // The offsetted load doesn't use any of the values in the comparison, so
+      // remove it from the list since we never need to check it again.
+      OffsettedLoadIndexesToRemove.push_back(LoadAndUse.first);
+      ++OffsettedLoadsRemoved;
+    }
+  }
+
+  for (auto *IndexToRemove : OffsettedLoadIndexesToRemove) {
+    OffsettedLoadAndUses.erase(
+        std::remove_if(
+            OffsettedLoadAndUses.begin(), OffsettedLoadAndUses.end(),
+            [&](const auto &Pair) { return Pair.first == IndexToRemove; }),
+        OffsettedLoadAndUses.end());
+  }
+  return false;
+}
+
+bool runOnFunctionImpl(Function &F) {
+  SmallVector<BasicBlock *, 4> BlocksToProtect;
+  for (auto &BB : F) {
+    // Check for guarded loads that need to be protected.
+    if (auto Comparison = tryGetComparisonIfGuarded(BB)) {
+      ++CandidateBlocks;
+      SmallVector<std::pair<const Value *, const Value *>, 4>
+          OffsettedLoadAndUses;
+      for (auto &I : BB) {
+        if (OffsettedLoadAndUses.empty()) {
+          if (tryGetIndicesIfOffsettedLoad(I)) {
+            OffsettedLoadAndUses.emplace_back(&I, &I);
+            ++OffsettedLoads;
+          }
+        } else {
+          // Case 1: Look for an indirect call where the target is an offsetted
+          // load.
+          if (auto *Call = dyn_cast<CallInst>(&I)) {
+            if (Call->isIndirectCall() &&
+                useRequiresProtection(*Call->getCalledOperand(), **Comparison,
+                                      OffsettedLoadAndUses)) {
+              BlocksToProtect.push_back(&BB);
+              break;
+            }
+
+            // Case 2: Look for an offsetted load that is used as an index.
+          } else if (auto DependentIndexOp = tryGetIndicesIfOffsettedLoad(I)) {
+            for (auto &Op : *DependentIndexOp) {
+              if (!isa<Constant>(&Op) &&
+                  useRequiresProtection(*Op, **Comparison,
+                                        OffsettedLoadAndUses)) {
+                BlocksToProtect.push_back(&BB);
+                break;
+              }
+            }
+
+            OffsettedLoadAndUses.emplace_back(&I, &I);
+            ++OffsettedLoads;
+
+            // Otherwise, check if this value uses something from an offsetted
+            // load or one of its downstreams.
+          } else if (auto *Instr = dyn_cast<Instruction>(&I)) {
+            if (Instr->isUnaryOp() || Instr->isBinaryOp() || Instr->isCast()) {
+              for (auto &Op : Instr->operands()) {
+                // If any use of an offsetted load is used by this instruction,
+                // then add this instruction as a use of that offsetted load as
+                // well.
+                for (auto &LoadAndUse : OffsettedLoadAndUses) {
+                  if (Op.get() == LoadAndUse.second) {
+                    OffsettedLoadAndUses.emplace_back(LoadAndUse.first, Instr);
+                    break;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  if (BlocksToProtect.empty()) {
+    return false;
+  }
+
+  // Add a barrier to each block that requires protection.
+  for (auto *BB : BlocksToProtect) {
+    IRBuilder<> Builder(&BB->front());
+    Builder.CreateIntrinsic(Intrinsic::speculative_data_barrier, {}, {});
+    ++NumIntrInserted;
+  }
+
+  return true;
+}
+
+char GuardedLoadHardening::ID = 0;
+INITIALIZE_PASS(GuardedLoadHardening, "GuardedLoadHardening",
+                "GuardedLoadHardening", false, false)
+
+bool GuardedLoadHardening::runOnFunction(Function &F) {
+  if (EnableGuardedLoadHardening) {
+    return runOnFunctionImpl(F);
+  }
+  return false;
+}
+
+PreservedAnalyses GuardedLoadHardeningPass::run(Function &F,
+                                                FunctionAnalysisManager &FAM) {
+  bool Changed = false;
+  if (EnableGuardedLoadHardening) {
+    Changed = runOnFunctionImpl(F);
+  }
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+FunctionPass *llvm::createGuardedLoadHardeningPass() {
+  return new GuardedLoadHardening();
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/speculative-data-barrier.ll b/llvm/test/CodeGen/AArch64/speculative-data-barrier.ll
new file mode 100644
index 00000000000000..e34c46f70802b6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/speculative-data-barrier.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; CHECK-LABEL:  f:
+; CHECK:        %bb.0:
+; CHECK-NEXT:       csdb
+; CHECK-NEXT:       ret
+define dso_local void @f() {
+  call void @llvm.speculative.data.barrier()
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.speculative.data.barrier() #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/speculative-data-barrier.ll b/llvm/test/CodeGen/X86/speculative-data-barrier.ll
new file mode 100644
index 00000000000000..e8d9a0a09830c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/speculative-data-barrier.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; CHECK-LABEL:  f:
+; CHECK:        %bb.0:
+; CHECK-NEXT:       lfence
+; CHECK-NEXT:       ret
+define dso_local void @f() {
+  call void @llvm.speculative.data.barrier()
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.speculative.data.barrier() #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
diff --git a/llvm/test/Transforms/Util/guarded-load-hardening.ll b/llvm/test/Transforms/Util/guarded-load-hardening.ll
new file mode 100644
index 00000000000000..79db6ed0020d18
--- /dev/null
+++ b/llvm/test/Transforms/Util/guarded-load-hardening.ll
@@ -0,0 +1,245 @@
+; RUN: opt -S -passes=guarded-load-hardening -guarded-load-hardening < %s | FileCheck %s --check-prefix ON
+; RUN: opt -S -passes=guarded-load-hardening < %s | FileCheck %s --check-prefix OFF
+
+; If the feature isn't enabled, we shouldn't see the intrinsic generated.
+; OFF-NOT:  call void @llvm.speculative.data.barrier()
+
+; Scenario: From the MSVC blog post
+; https://devblogs.microsoft.com/cppblog/spectre-mitigations-in-msvc/
+; From the C++:
+; int guarded_index_load_from_array(unsigned char* indexes, unsigned char* data, int index, int indexes_len) {
+;     if (index < indexes_len) {
+;         unsigned char sub_index = indexes[index];
+;         return data[sub_index * 64];
+;     }
+;     return 0;
+; }
+define dso_local noundef i32 @guarded_index_load_from_array(
+  ptr nocapture noundef readonly %indexes,
+  ptr nocapture noundef readonly %data,
+  i32 noundef %index,
+  i32 noundef %indexes_len) {
+entry:
+  %cmp = icmp slt i32 %index, %indexes_len
+  br i1 %cmp, label %if.then, label %return
+
+; ON-LABEL: define dso_local noundef i32 @guarded_index_load_from_array
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %idxprom = sext i32 %index to i64
+  %arrayidx = getelementptr inbounds i8, ptr %indexes, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %mul = shl nuw nsw i64 %conv, 6
+  %arrayidx2 = getelementptr inbounds i8, ptr %data, i64 %mul
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %conv3, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+; Scenario: As above (the MSVC blog post), but with an indirect call.
+; From the C++:
+; using FPtr = int(*)();
+; int guarded_fptr_call_from_array(FPtr* funcs, int index, int funcs_len) {
+;     if (index < funcs_len) {
+;         return funcs[index * 4]();
+;     }
+;     return 0;
+; }
+define dso_local noundef i32 @guarded_fptr_call_from_array(
+  ptr nocapture noundef readonly %funcs,
+  i32 noundef %index,
+  i32 noundef %funcs_len) local_unnamed_addr {
+entry:
+  %cmp = icmp slt i32 %index, %funcs_len
+  br i1 %cmp, label %if.then, label %return
+
+; ON-LABEL: define dso_local noundef i32 @guarded_fptr_call_from_array
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %mul = shl nsw i32 %index, 2
+  %idxprom = sext i32 %mul to i64
+  %arrayidx = getelementptr inbounds ptr, ptr %funcs, i64 %idxprom
+  %0 = load ptr, ptr %arrayidx, align 8
+  %call = tail call noundef i32 %0()
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+@temp = dso_local local_unnamed_addr global i8 0, align 1
+@array1_size = external local_unnamed_addr global i32, align 4
+@array2 = external local_unnamed_addr global [0 x i8], align 1
+@array1 = external local_unnamed_addr global [0 x i8], align 1
+
+; Scenario: As written in the Spectre paper
+; From the C++:
+; void victim_function(size_t x) {
+;   if (x < array1_size) {
+;     temp &= array2[array1[x] * 512];
+;   }
+; }
+define dso_local void @victim_function(i64 noundef %x) local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @array1_size, align 4
+  %conv = zext i32 %0 to i64
+  %cmp = icmp ult i64 %x, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+; ON-LABEL: define dso_local void @victim_function
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %arrayidx = getelementptr inbounds nuw [0 x i8], ptr @array1, i64 0, i64 %x
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv1 = zext i8 %1 to i64
+  %mul = shl nuw nsw i64 %conv1, 9
+  %arrayidx2 = getelementptr inbounds [0 x i8], ptr @array2, i64 0, i64 %mul
+  %2 = load i8, ptr %arrayidx2, align 1
+  %3 = load i8, ptr @temp, align 1
+  %and7 = and i8 %3, %2
+  store i8 %and7, ptr @temp, align 1
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Scenario: Shift/multiply the index
+; From the C++:
+; void victim_function_alt03(size_t x) {
+;   if (x < array1_size)
+;     temp &= array2[array1[x << 1] * 512];
+; }
+define dso_local void @victim_function_alt03(i64 noundef %x) local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @array1_size, align 4
+  %conv = zext i32 %0 to i64
+  %cmp = icmp ult i64 %x, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+; ON-LABEL: define dso_local void @victim_function_alt03
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %shl = shl nuw nsw i64 %x, 1
+  %arrayidx = getelementptr inbounds nuw [0 x i8], ptr @array1, i64 0, i64 %shl
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv1 = zext i8 %1 to i64
+  %mul = shl nuw nsw i64 %conv1, 9
+  %arrayidx2 = getelementptr inbounds [0 x i8], ptr @array2, i64 0, i64 %mul
+  %2 = load i8, ptr %arrayidx2, align 1
+  %3 = load i8, ptr @temp, align 1
+  %and7 = and i8 %3, %2
+  store i8 %and7, ptr @temp, align 1
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Scenario: Pointer arithmetic + memcmp
+; From the C++:
+; void victim_function_alt10(size_t x) {
+;   if (x < array1_size)
+;     temp = memcmp(&temp, array2+(array1[x] * 512), 1);
+; }
+define dso_local void @victim_function_alt10(i64 noundef %x) local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @array1_size, align 4
+  %conv = zext i32 %0 to i64
+  %cmp = icmp ult i64 %x, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+; ON-LABEL: define dso_local void @victim_function_alt10
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %arrayidx = getelementptr inbounds nuw [0 x i8], ptr @array1, i64 0, i64 %x
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv1 = zext i8 %1 to i64
+  %mul = shl nuw nsw i64 %conv1, 9
+  %add.ptr = getelementptr inbounds i8, ptr @array2, i64 %mul
+  %lhsc = load i8, ptr @temp, align 1
+  %rhsc = load i8, ptr %add.ptr, align 1
+  %chardiff = sub i8 %lhsc, %rhsc
+  store i8 %chardiff, ptr @temp, align 1
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Scenario: Index uses sum of two args
+; From the C++:
+; void victim_function_alt11(size_t x, size_t y) {
+;   if ((x+y) < array1_size)
+;     temp &= array2[array1[x+y] * 512];
+; }
+define dso_local void @victim_function_alt11(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
+entry:
+  %add = add i64 %y, %x
+  %0 = load i32, ptr @array1_size, align 4
+  %conv = zext i32 %0 to i64
+  %cmp = icmp ult i64 %add, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+; ON-LABEL: define dso_local void @victim_function_alt11
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %arrayidx = getelementptr inbounds nuw [0 x i8], ptr @array1, i64 0, i64 %add
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv2 = zext i8 %1 to i64
+  %mul = shl nuw nsw i64 %conv2, 9
+  %arrayidx3 = getelementptr inbounds [0 x i8], ptr @array2, i64 0, i64 %mul
+  %2 = load i8, ptr %arrayidx3, align 1
+  %3 = load i8, ptr @temp, align 1
+  %and9 = and i8 %3, %2
+  store i8 %and9, ptr @temp, align 1
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Scenario: Invert the bits of the index
+; From the C++:
+; void victim_function_alt13(size_t x) {
+;   if (x < array1_size)
+;     temp &= array2[array1[x ^ 255] * 512];
+; }
+define dso_local void @victim_function_alt13(i64 noundef %x) local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @array1_size, align 4
+  %conv = zext i32 %0 to i64
+  %cmp = icmp ult i64 %x, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+; ON-LABEL: define dso_local void @victim_function_alt13
+; ON:       if.then:
+; ON-NEXT:  call void @llvm.speculative.data.barrier()
+if.then:
+  %xor = xor i64 %x, 255
+  %arrayidx = getelementptr inbounds nuw [0 x i8], ptr @array1, i64 0, i64 %xor
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv1 = zext i8 %1 to i64
+  %mul = shl nuw nsw i64 %conv1, 9
+  %arrayidx2 = getelementptr inbounds [0 x i8], ptr @array2, i64 0, i64 %mul
+  %2 = load i8, ptr %arrayidx2, align 1
+  %3 = load i8, ptr @temp, align 1
+  %and7 = and i8 %3, %2
+  store i8 %and7, ptr @temp, align 1
+  br label %if.end
+
+if.end:
+  ret void
+}
\ No newline at end of file


