[llvm] [x64][win] Add compiler support for x64 import call optimization (equivalent to MSVC /d2guardretpoline) (PR #126631)

Daniel Paoliello via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 11 12:38:49 PST 2025


https://github.com/dpaoliello updated https://github.com/llvm/llvm-project/pull/126631

From b4ab8af255bbd15b45618a164bd32e875534468b Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Thu, 23 Jan 2025 10:45:02 -0800
Subject: [PATCH] Implement Import Call Optimization for x64

---
 llvm/include/llvm/Transforms/CFGuard.h        |   3 +
 llvm/lib/MC/MCObjectFileInfo.cpp              |   5 +
 llvm/lib/Target/X86/X86AsmPrinter.cpp         |  32 ++++
 llvm/lib/Target/X86/X86AsmPrinter.h           |  33 +++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  18 +-
 llvm/lib/Target/X86/X86ISelLowering.h         |   8 +-
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   |   7 +-
 llvm/lib/Target/X86/X86InstrCompiler.td       |   2 +
 llvm/lib/Target/X86/X86InstrFragments.td      |   3 +
 llvm/lib/Target/X86/X86MCInstLower.cpp        | 169 ++++++++++++++++--
 llvm/lib/Transforms/CFGuard/CFGuard.cpp       |  15 +-
 .../win-import-call-optimization-cfguard.ll   |  34 ++++
 .../win-import-call-optimization-jumptable.ll |  83 +++++++++
 .../win-import-call-optimization-nocalls.ll   |  21 +++
 .../X86/win-import-call-optimization.ll       |  65 +++++++
 .../MC/X86/win-import-call-optimization.s     |  69 +++++++
 16 files changed, 540 insertions(+), 27 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
 create mode 100644 llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
 create mode 100644 llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
 create mode 100644 llvm/test/CodeGen/X86/win-import-call-optimization.ll
 create mode 100644 llvm/test/MC/X86/win-import-call-optimization.s
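
A module opts in to this optimization via the "import-call-optimization"
module flag; direct calls to dllimport functions are then emitted with a rex
prefix and trailing nop padding, and each call site is recorded in the
.retplne metadata section. A minimal IR sketch, distilled from the included
tests (function names here are placeholders; compile with
llc -mtriple=x86_64-pc-windows-msvc):

    declare dllimport void @imported_func()

    define void @caller() {
    entry:
      ; Expected lowering: `rex64; callq __imp_imported_func` followed by a
      ; 5-byte nop, with the call site recorded in the .retplne section.
      call void @imported_func()
      ret void
    }

    !llvm.module.flags = !{!0}
    !0 = !{i32 1, !"import-call-optimization", i32 1}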

diff --git a/llvm/include/llvm/Transforms/CFGuard.h b/llvm/include/llvm/Transforms/CFGuard.h
index caf822a2ec9fb..b81db8f487965 100644
--- a/llvm/include/llvm/Transforms/CFGuard.h
+++ b/llvm/include/llvm/Transforms/CFGuard.h
@@ -16,6 +16,7 @@
 namespace llvm {
 
 class FunctionPass;
+class GlobalValue;
 
 class CFGuardPass : public PassInfoMixin<CFGuardPass> {
 public:
@@ -34,6 +35,8 @@ FunctionPass *createCFGuardCheckPass();
 /// Insert Control FLow Guard dispatches on indirect function calls.
 FunctionPass *createCFGuardDispatchPass();
 
+bool isCFGuardFunction(const GlobalValue *GV);
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 150e38a94db6a..334673c4dba79 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -599,6 +599,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
   if (T.getArch() == Triple::aarch64) {
     ImportCallSection =
         Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO);
+  } else if (T.getArch() == Triple::x86_64) {
+    // Import Call Optimization on x64 leverages the same metadata as the
+    // retpoline mitigation, hence the unusual section name.
+    ImportCallSection =
+        Ctx->getCOFFSection(".retplne", COFF::IMAGE_SCN_LNK_INFO);
   }
 
   // Debug info.
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index f01e47b41cf5e..52f8280b25965 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -920,6 +920,9 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
     OutStreamer->emitSymbolAttribute(S, MCSA_Global);
     OutStreamer->emitAssignment(
         S, MCConstantExpr::create(Feat00Value, MMI->getContext()));
+
+    if (M.getModuleFlag("import-call-optimization"))
+      EnableImportCallOptimization = true;
   }
   OutStreamer->emitSyntaxDirective();
 
@@ -1021,6 +1024,35 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
     // safe to set.
     OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   } else if (TT.isOSBinFormatCOFF()) {
+    // If import call optimization is enabled, emit the appropriate section.
+    // We do this whether or not we recorded any items.
+    if (EnableImportCallOptimization) {
+      OutStreamer->switchSection(getObjFileLowering().getImportCallSection());
+
+      // Section always starts with some magic.
+      constexpr char ImpCallMagic[12] = "RetpolineV1";
+      OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)});
+
+      // Layout of this section is:
+      // Per section that contains an item to record:
+      //  uint32_t SectionSize: Size in bytes for information in this section.
+      //  uint32_t Section Number
+      //  Per call to imported function in section:
+      //    uint32_t Kind: the kind of item.
+      //    uint32_t InstOffset: the offset of the instr in its parent section.
+      for (auto &[Section, CallsToImportedFuncs] :
+           SectionToImportedFunctionCalls) {
+        unsigned SectionSize =
+            sizeof(uint32_t) * (2 + 2 * CallsToImportedFuncs.size());
+        OutStreamer->emitInt32(SectionSize);
+        OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol());
+        for (auto &[CallsiteSymbol, Kind] : CallsToImportedFuncs) {
+          OutStreamer->emitInt32(Kind);
+          OutStreamer->emitCOFFSecOffset(CallsiteSymbol);
+        }
+      }
+    }
+
     if (usesMSVCFloatingPoint(TT, M)) {
       // In Windows' libcmt.lib, there is a file which is linked in only if the
       // symbol _fltused is referenced. Linking this in causes some
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index 693021eca3295..47e82c4dfcea5 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -31,6 +31,26 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   bool EmitFPOData = false;
   bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false;
   bool IndCSPrefix = false;
+  bool EnableImportCallOptimization = false;
+
+  enum ImportCallKind : unsigned {
+    IMAGE_RETPOLINE_AMD64_IMPORT_BR = 0x02,
+    IMAGE_RETPOLINE_AMD64_IMPORT_CALL = 0x03,
+    IMAGE_RETPOLINE_AMD64_INDIR_BR = 0x04,
+    IMAGE_RETPOLINE_AMD64_INDIR_CALL = 0x05,
+    IMAGE_RETPOLINE_AMD64_INDIR_BR_REX = 0x06,
+    IMAGE_RETPOLINE_AMD64_CFG_BR = 0x08,
+    IMAGE_RETPOLINE_AMD64_CFG_CALL = 0x09,
+    IMAGE_RETPOLINE_AMD64_CFG_BR_REX = 0x0A,
+    IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST = 0x010,
+    IMAGE_RETPOLINE_AMD64_SWITCHTABLE_LAST = 0x01F,
+  };
+  struct ImportCallInfo {
+    MCSymbol *CalleeSymbol;
+    ImportCallKind Kind;
+  };
+  DenseMap<MCSection *, std::vector<ImportCallInfo>>
+      SectionToImportedFunctionCalls;
 
   // This utility class tracks the length of a stackmap instruction's 'shadow'.
   // It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -45,7 +65,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
     void startFunction(MachineFunction &MF) {
       this->MF = &MF;
     }
-    void count(MCInst &Inst, const MCSubtargetInfo &STI,
+    void count(const MCInst &Inst, const MCSubtargetInfo &STI,
                MCCodeEmitter *CodeEmitter);
 
     // Called to signal the start of a shadow of RequiredSize bytes.
@@ -126,6 +146,17 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI,
                                     MCSymbol *LazyPointer) override;
 
+  void emitCallInstruction(const llvm::MCInst &MCI);
+
+  // Emits a label to mark the next instruction as being relevant to Import Call
+  // Optimization.
+  void emitLabelAndRecordForImportCallOptimization(ImportCallKind Kind);
+
+  // Ensure that rax is used as the operand for the given instruction.
+  //
+  // NOTE: This assumes that it is safe to clobber rax.
+  void ensureRaxUsedForOperand(MCInst &TmpInst);
+
 public:
   X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6cf6061deba70..30a98a5a13ebf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18922,7 +18922,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                                SelectionDAG &DAG) const {
-  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
 SDValue
@@ -18950,7 +18950,8 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 /// Creates target global address or external symbol nodes for calls or
 /// other uses.
 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
-                                                 bool ForCall) const {
+                                                 bool ForCall,
+                                                 bool *IsImpCall) const {
   // Unpack the global address or external symbol.
   SDLoc dl(Op);
   const GlobalValue *GV = nullptr;
@@ -19000,6 +19001,16 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
     return Result;
 
+  // If Import Call Optimization is enabled and this is an imported function
+  // then make a note of it and return the global address without wrapping.
+  if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
+      Mod.getModuleFlag("import-call-optimization")) {
+    assert(ForCall && "Should only enable import call optimization if we are "
+                      "lowering a call");
+    *IsImpCall = true;
+    return Result;
+  }
+
   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
 
   // With PIC, the address is actually $g + Offset.
@@ -19025,7 +19036,7 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
 
 SDValue
 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
-  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
@@ -34562,6 +34573,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FST)
   NODE_NAME_CASE(CALL)
   NODE_NAME_CASE(CALL_RVMARKER)
+  NODE_NAME_CASE(IMP_CALL)
   NODE_NAME_CASE(BT)
   NODE_NAME_CASE(CMP)
   NODE_NAME_CASE(FCMP)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fe79fefeed631..6324cc65398a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -81,6 +81,10 @@ namespace llvm {
     // marker instruction.
     CALL_RVMARKER,
 
+    // Pseudo for a call to an imported function to ensure the correct machine
+    // instruction is emitted for Import Call Optimization.
+    IMP_CALL,
+
     /// X86 compare and logical compare instructions.
     CMP,
     FCMP,
@@ -1733,8 +1737,8 @@ namespace llvm {
 
     /// Creates target global address or external symbol nodes for calls or
     /// other uses.
-    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
-                                  bool ForCall) const;
+    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
+                                  bool *IsImpCall) const;
 
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 6835c7e336a5c..cbbdf37a3fb75 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2402,6 +2402,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     InGlue = Chain.getValue(1);
   }
 
+  bool IsImpCall = false;
   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
     // In the 64-bit large code model, we have to make all calls
@@ -2414,7 +2415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // ForCall to true here has the effect of removing WrapperRIP when possible
     // to allow direct calls to be selected without first materializing the
     // address into a register.
-    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
+    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true, &IsImpCall);
   } else if (Subtarget.isTarget64BitILP32() &&
              Callee.getValueType() == MVT::i32) {
     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -2536,7 +2537,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Returns a chain & a glue for retval copy to use.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
+  if (IsImpCall) {
+    Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
+  } else if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 9687ae29f1c78..5f603de695906 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1309,6 +1309,8 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
 def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
           (CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
 
+def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
 
 // Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
 // can never use callee-saved registers. That is the purpose of the GR64_TC
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index ddbc7c55a6113..3ab820de78efc 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -210,6 +210,9 @@ def X86call_rvmarker  : SDNode<"X86ISD::CALL_RVMARKER",     SDT_X86Call,
                         [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                          SDNPVariadic]>;
 
+def X86imp_call  : SDNode<"X86ISD::IMP_CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                         SDNPVariadic]>;
 
 def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
                             [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 0f8fbf5be1c95..f265093a60d12 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -47,6 +47,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/CFGuard.h"
 #include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
 #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
 #include <string>
@@ -112,7 +113,7 @@ struct NoAutoPaddingScope {
 static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
                         const X86Subtarget *Subtarget);
 
-void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+void X86AsmPrinter::StackMapShadowTracker::count(const MCInst &Inst,
                                                  const MCSubtargetInfo &STI,
                                                  MCCodeEmitter *CodeEmitter) {
   if (InShadow) {
@@ -2193,6 +2194,27 @@ static void addConstantComments(const MachineInstr *MI,
   }
 }
 
+bool isImportedFunction(const MachineOperand &MO) {
+  return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_DLLIMPORT);
+}
+
+bool isCallToCFGuardFunction(const MachineInstr *MI) {
+  assert(MI->getOpcode() == X86::TAILJMPm64_REX ||
+         MI->getOpcode() == X86::CALL64m);
+  const MachineOperand &MO = MI->getOperand(3);
+  return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_NO_FLAG) &&
+         isCFGuardFunction(MO.getGlobal());
+}
+
+bool hasJumpTableInfoInBlock(const llvm::MachineInstr *MI) {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I)
+    if (I->isJumpTableDebugInfo())
+      return true;
+
+  return false;
+}
+
 void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -2271,20 +2293,64 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::TAILJMPd64:
     if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
       EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
-    [[fallthrough]];
-  case X86::TAILJMPr:
+
+    if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_IMPORT_BR);
+    }
+
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("TAILCALL");
+    break;
+  case X86::TAILJMPm64_REX:
+    if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+    }
+
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("TAILCALL");
+    break;
   case X86::TAILJMPm:
   case X86::TAILJMPd:
   case X86::TAILJMPd_CC:
-  case X86::TAILJMPr64:
   case X86::TAILJMPm64:
   case X86::TAILJMPd64_CC:
-  case X86::TAILJMPr64_REX:
-  case X86::TAILJMPm64_REX:
     // Lower these as normal, but add some comments.
     OutStreamer->AddComment("TAILCALL");
     break;
 
+  case X86::TAILJMPr:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPr64_REX: {
+    MCInst TmpInst;
+    MCInstLowering.Lower(MI, TmpInst);
+
+    if (EnableImportCallOptimization) {
+      // Import call optimization requires all indirect calls go via RAX.
+      ensureRaxUsedForOperand(TmpInst);
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_INDIR_BR);
+    }
+
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("TAILCALL");
+    EmitAndCountInstruction(TmpInst);
+    return;
+  }
+
+  case X86::JMP64r:
+  case X86::JMP64m:
+    if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI)) {
+      uint16_t EncodedReg =
+          this->getSubtarget().getRegisterInfo()->getEncodingValue(
+              MI->getOperand(0).getReg().asMCReg());
+      emitLabelAndRecordForImportCallOptimization(
+          (ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST +
+                           EncodedReg));
+    }
+    break;
+
   case X86::TLS_addr32:
   case X86::TLS_addr64:
   case X86::TLS_addrX32:
@@ -2469,7 +2535,49 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::CALL64pcrel32:
     if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
       EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
+
+    if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
+
+      MCInst TmpInst;
+      MCInstLowering.Lower(MI, TmpInst);
+
+      // For Import Call Optimization to work, the call instruction must have
+      // a rex prefix and must be followed by a 5-byte nop.
+      EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+      emitCallInstruction(TmpInst);
+      emitNop(*OutStreamer, 5, Subtarget);
+      return;
+    }
+
     break;
+  case X86::CALL64r:
+    if (EnableImportCallOptimization) {
+      MCInst TmpInst;
+      MCInstLowering.Lower(MI, TmpInst);
+
+      // Import call optimization requires all indirect calls go via RAX.
+      ensureRaxUsedForOperand(TmpInst);
+
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_INDIR_CALL);
+      emitCallInstruction(TmpInst);
+
+      // For Import Call Optimization to work, we need a 3-byte nop after the
+      // call instruction.
+      emitNop(*OutStreamer, 3, Subtarget);
+      return;
+    }
+
+    break;
+  case X86::CALL64m:
+    if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_CFG_CALL);
+    }
+    break;
+
   case X86::JCC_1:
     // Two instruction prefixes (2EH for branch not-taken and 3EH for branch
     // taken) are used as branch hints. Here we add branch taken prefix for
@@ -2490,20 +2598,47 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
   MCInstLowering.Lower(MI, TmpInst);
 
-  // Stackmap shadows cannot include branch targets, so we can count the bytes
-  // in a call towards the shadow, but must ensure that the no thread returns
-  // in to the stackmap shadow.  The only way to achieve this is if the call
-  // is at the end of the shadow.
   if (MI->isCall()) {
-    // Count then size of the call towards the shadow
-    SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
-    // Then flush the shadow so that we fill with nops before the call, not
-    // after it.
-    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
-    // Then emit the call
-    OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
+    emitCallInstruction(TmpInst);
     return;
   }
 
   EmitAndCountInstruction(TmpInst);
 }
+
+void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
+  // Stackmap shadows cannot include branch targets, so we can count the bytes
+  // in a call towards the shadow, but must ensure that no thread returns
+  // into the stackmap shadow.  The only way to achieve this is if the call
+  // is at the end of the shadow.
+
+  // Count the size of the call towards the shadow
+  SMShadowTracker.count(MCI, getSubtargetInfo(), CodeEmitter.get());
+  // Then flush the shadow so that we fill with nops before the call, not
+  // after it.
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  // Then emit the call
+  OutStreamer->emitInstruction(MCI, getSubtargetInfo());
+}
+
+void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
+    ImportCallKind Kind) {
+  assert(EnableImportCallOptimization);
+
+  MCSymbol *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall");
+  OutStreamer->emitLabel(CallSiteSymbol);
+
+  SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()]
+      .push_back({CallSiteSymbol, Kind});
+}
+
+void X86AsmPrinter::ensureRaxUsedForOperand(MCInst &TmpInst) {
+  assert(TmpInst.getNumOperands() == 1);
+
+  MCOperand &Op = TmpInst.getOperand(0);
+  if (Op.isReg() && Op.getReg() != X86::RAX) {
+    EmitAndCountInstruction(
+        MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(Op.getReg()));
+    Op.setReg(X86::RAX);
+  }
+}
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 41d68b62eb8d7..cadc538021103 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -31,6 +31,9 @@ using OperandBundleDef = OperandBundleDefT<Value *>;
 
 STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
 
+constexpr StringRef GuardCheckFunctionName = "__guard_check_icall_fptr";
+constexpr StringRef GuardDispatchFunctionName = "__guard_dispatch_icall_fptr";
+
 namespace {
 
 /// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
@@ -45,10 +48,10 @@ class CFGuardImpl {
     // Get or insert the guard check or dispatch global symbols.
     switch (GuardMechanism) {
     case Mechanism::Check:
-      GuardFnName = "__guard_check_icall_fptr";
+      GuardFnName = GuardCheckFunctionName;
       break;
     case Mechanism::Dispatch:
-      GuardFnName = "__guard_dispatch_icall_fptr";
+      GuardFnName = GuardDispatchFunctionName;
       break;
     }
   }
@@ -318,3 +321,11 @@ FunctionPass *llvm::createCFGuardCheckPass() {
 FunctionPass *llvm::createCFGuardDispatchPass() {
   return new CFGuard(CFGuardPass::Mechanism::Dispatch);
 }
+
+bool llvm::isCFGuardFunction(const GlobalValue *GV) {
+  if (GV->getLinkage() != GlobalValue::ExternalLinkage)
+    return false;
+
+  StringRef Name = GV->getName();
+  return Name == GuardCheckFunctionName || Name == GuardDispatchFunctionName;
+}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
new file mode 100644
index 0000000000000..12be910d68ee9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+
+define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
+entry:
+  call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     callq   *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  tail_call_fp:
+; CHECK:        .Limpcall1:
+; CHECK-NEXT:     rex64 jmpq      *__guard_dispatch_icall_fptr(%rip)
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NEXT   .long   16
+; CHECK-NEXT   .secnum tc_sect
+; CHECK-NEXT   .long   10
+; CHECK-NEXT   .secoffset      .Limpcall1
+; CHECK-NEXT   .long   16
+; CHECK-NEXT   .secnum nc_sect
+; CHECK-NEXT   .long   9
+; CHECK-NEXT   .secoffset      .Limpcall0
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"import-call-optimization", i32 1}
+!1 = !{i32 2, !"cfguard", i32 2}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
new file mode 100644
index 0000000000000..fe22b251685e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+
+; CHECK-LABEL:  uses_rax:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     jmpq    *%rax
+
+define void @uses_rax(i32 %x) {
+entry:
+  switch i32 %x, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  tail call void @g(i32 0) #2
+  br label %sw.epilog
+
+sw.bb1:
+  tail call void @g(i32 1) #2
+  br label %sw.epilog
+
+sw.bb2:
+  tail call void @g(i32 2) #2
+  br label %sw.epilog
+
+sw.bb3:
+  tail call void @g(i32 3) #2
+  br label %sw.epilog
+
+sw.epilog:
+  tail call void @g(i32 10) #2
+  ret void
+}
+
+; CHECK-LABEL:  uses_rcx:
+; CHECK:        .Limpcall1:
+; CHECK-NEXT:     jmpq    *%rcx
+
+define void @uses_rcx(i32 %x) {
+entry:
+  switch i32 %x, label %sw.epilog [
+    i32 10, label %sw.bb
+    i32 11, label %sw.bb1
+    i32 12, label %sw.bb2
+    i32 13, label %sw.bb3
+  ]
+
+sw.bb:
+  tail call void @g(i32 0) #2
+  br label %sw.epilog
+
+sw.bb1:
+  tail call void @g(i32 1) #2
+  br label %sw.epilog
+
+sw.bb2:
+  tail call void @g(i32 2) #2
+  br label %sw.epilog
+
+sw.bb3:
+  tail call void @g(i32 3) #2
+  br label %sw.epilog
+
+sw.epilog:
+  tail call void @g(i32 10) #2
+  ret void
+}
+
+declare void @g(i32)
+
+; CHECK-LABEL:  .section        .retplne,"yi"
+; CHECK-NEXT:   .asciz  "RetpolineV1"
+; CHECK-NEXT:   .long   24
+; CHECK-NEXT:   .secnum .text
+; CHECK-NEXT:   .long   16
+; CHECK-NEXT:   .secoffset      .Limpcall0
+; CHECK-NEXT:   .long   17
+; CHECK-NEXT:   .secoffset      .Limpcall1
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
new file mode 100644
index 0000000000000..4ca7b85282f2e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+
+define dso_local void @normal_call() local_unnamed_addr {
+entry:
+  call void @a()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        callq   a
+
+declare void @a() local_unnamed_addr
+
+; Even if there are no calls to imported functions, we still need to emit the
+; .retplne section.
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NOT    .secnum
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
new file mode 100644
index 0000000000000..e669984ac37a4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+
+define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
+entry:
+  call void @a()
+  call void @a()
+  call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     rex64
+; CHECK-NEXT:     callq   __imp_a
+; CHECK-NEXT:     nopl    8(%rax,%rax)
+; CHECK-NEXT:   .Limpcall1:
+; CHECK-NEXT:     rex64
+; CHECK-NEXT:     callq   __imp_a
+; CHECK-NEXT:     nopl    8(%rax,%rax)
+; CHECK-NEXT:     movq    %rsi, %rax
+; CHECK-NEXT:   .Limpcall2:
+; CHECK-NEXT:     callq   *%rax
+; CHECK-NEXT:     nopl    (%rax)
+; CHECK-NEXT:     nop
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void @b()
+  ret void
+}
+; CHECK-LABEL:  tail_call:
+; CHECK:        .Limpcall3:
+; CHECK-NEXT:     jmp __imp_b
+
+define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  tail_call_fp:
+; CHECK:          movq    %rcx, %rax
+; CHECK-NEXT:   .Limpcall4:
+; CHECK-NEXT:     rex64 jmpq      *%rax
+
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NEXT   .long   24
+; CHECK-NEXT   .secnum tc_sect
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall3
+; CHECK-NEXT   .long   5
+; CHECK-NEXT   .secoffset      .Limpcall4
+; CHECK-NEXT   .long   32
+; CHECK-NEXT   .secnum nc_sect
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall0
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall1
+; CHECK-NEXT   .long   5
+; CHECK-NEXT   .secoffset      .Limpcall2
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/MC/X86/win-import-call-optimization.s b/llvm/test/MC/X86/win-import-call-optimization.s
new file mode 100644
index 0000000000000..4f839a2bc6011
--- /dev/null
+++ b/llvm/test/MC/X86/win-import-call-optimization.s
@@ -0,0 +1,69 @@
+// RUN: llvm-mc -triple x86_64-windows-msvc -filetype obj -o %t.obj %s
+// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s
+
+.section        nc_sect,"xr"
+normal_call:
+.seh_proc normal_call
+# %bb.0:                                # %entry
+        subq    $40, %rsp
+        .seh_stackalloc 40
+        .seh_endprologue
+.Limpcall0:
+        rex64
+        callq   *__imp_a(%rip)
+        nopl    8(%rax,%rax)
+        nop
+        addq    $40, %rsp
+        retq
+        .seh_endproc
+
+.section        tc_sect,"xr"
+tail_call:
+.Limpcall1:
+        rex64
+        jmp     *__imp_b(%rip)
+
+.section        .retplne,"yi"
+.asciz  "RetpolineV1"
+.long   16
+.secnum tc_sect
+.long   2
+.secoffset .Limpcall1
+.long   16
+.secnum nc_sect
+.long   3
+.secoffset .Limpcall0
+
+// CHECK-LABEL: Name: .retplne (2E 72 65 74 70 6C 6E 65)
+// CHECK-NEXT:  VirtualSize: 0x0
+// CHECK-NEXT:  VirtualAddress: 0x0
+// CHECK-NEXT:  RawDataSize: 44
+// CHECK-NEXT:  PointerToRawData:
+// CHECK-NEXT:  PointerToRelocations:
+// CHECK-NEXT:  PointerToLineNumbers:
+// CHECK-NEXT:  RelocationCount: 0
+// CHECK-NEXT:  LineNumberCount: 0
+// CHECK-NEXT:  Characteristics [
+// CHECK-NEXT:    IMAGE_SCN_ALIGN_1BYTES
+// CHECK-NEXT:    IMAGE_SCN_LNK_INFO
+// CHECK-NEXT:  ]
+// CHECK-NEXT:  SectionData (
+// CHECK-NEXT:    52657470 6F6C696E 65563100 10000000  |RetpolineV1.....|
+// CHECK-NEXT:    0010:
+// CHECK-SAME:    [[#%.2X,TCSECT:]]000000
+// CHECK-SAME:    02000000
+// CHECK-SAME:    [[#%.2X,TCOFFSET:]]000000
+// CHECK-SAME:    10000000
+// CHECK-NEXT:    0020:
+// CHECK-SAME:    [[#%.2X,NCSECT:]]000000
+// CHECK-SAME:    03000000
+// CHECK-SAME:    [[#%.2X,NCOFFSET:]]000000
+// CHECK-NEXT:  )
+
+// CHECK-LABEL: Relocations [
+// CHECK-NEXT:     Section ([[#%u,NCSECT]]) nc_sect {
+// CHECK-NEXT:       0x[[#%x,NCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_a
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Section ([[#%u,TCSECT]]) tc_sect {
+// CHECK-NEXT:       0x[[#%x,TCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_b
+// CHECK-NEXT:     }



More information about the llvm-commits mailing list