[llvm] a414877 - [x64][win] Add compiler support for x64 import call optimization (equivalent to MSVC /d2guardretpoline) (#126631)

via llvm-commits llvm-commits at lists.llvm.org
Tue May 20 14:48:44 PDT 2025


Author: Daniel Paoliello
Date: 2025-05-20T14:48:41-07:00
New Revision: a414877a7a5f000d01370acb1162eb1dea87f48c

URL: https://github.com/llvm/llvm-project/commit/a414877a7a5f000d01370acb1162eb1dea87f48c
DIFF: https://github.com/llvm/llvm-project/commit/a414877a7a5f000d01370acb1162eb1dea87f48c.diff

LOG: [x64][win] Add compiler support for x64 import call optimization (equivalent to MSVC /d2guardretpoline) (#126631)

This is the x64 equivalent of #121516

Since import call optimization was originally [added to x64 Windows to
implement a more efficient retpoline
mitigation](https://techcommunity.microsoft.com/blog/windowsosplatform/mitigating-spectre-variant-2-with-retpoline-on-windows/295618),
the section and constant names relating to it all mention "retpoline",
and we need to mark indirect calls, control-flow guard calls, and jumps
for jump tables in the section alongside calls to imported functions.

As with the AArch64 feature, this emits a new section into the object
file that the MSVC linker uses to generate the Dynamic Value Relocation
Table; the section itself does not appear in the final binary.
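
Concretely, the section contents look roughly like this (a sketch
mirroring the new MC test): a magic header followed by, for each code
section containing annotated sites, the size of its entry, its COFF
section number, and a (kind, offset) pair per annotated instruction:

```
        .section .retplne,"yi"
        .asciz  "RetpolineV1"            # magic header
        .long   16                       # byte size of this section's entry
        .secnum nc_sect                  # COFF section number of nc_sect
        .long   3                        # kind (IMAGE_RETPOLINE_AMD64_IMPORT_CALL)
        .secoffset .Limpcall0            # offset of the annotated call site
```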

The Windows Loader requires that specific instruction sequences be
emitted when this feature is enabled (see the sketch after this list):
* Indirect calls/jumps must have the target function pointer in `rax`.
* Calls to imported functions must use the `rex` prefix and be followed
by a 5-byte nop.
* Indirect calls must be followed by a 3-byte nop.
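
For illustration, here is a minimal sketch of the code this produces,
adapted from the expectations in the new `win-import-call-optimization.ll`
test (`a` is a dllimport'd function, the function pointer being called
indirectly happens to be in `%rsi`, and the whole feature is gated on the
`"import-call-optimization"` module flag):

```
.Limpcall0:                      # label recorded in the .retplne section
        rex64
        callq   __imp_a          # call to an imported function: rex prefix...
        nopl    8(%rax,%rax)     # ...followed by a 5-byte nop
        movq    %rsi, %rax       # indirect call target must be in rax
.Limpcall1:
        callq   *%rax            # indirect call...
        nopl    (%rax)           # ...followed by a 3-byte nop
```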

Added: 
    llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
    llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
    llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
    llvm/test/CodeGen/X86/win-import-call-optimization.ll
    llvm/test/MC/X86/win-import-call-optimization.s

Modified: 
    llvm/include/llvm/Transforms/CFGuard.h
    llvm/lib/MC/MCObjectFileInfo.cpp
    llvm/lib/Target/X86/X86AsmPrinter.cpp
    llvm/lib/Target/X86/X86AsmPrinter.h
    llvm/lib/Target/X86/X86ExpandPseudo.cpp
    llvm/lib/Target/X86/X86FastISel.cpp
    llvm/lib/Target/X86/X86FrameLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86ISelLoweringCall.cpp
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/lib/Target/X86/X86InstrControl.td
    llvm/lib/Target/X86/X86InstrFragments.td
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/lib/Target/X86/X86InstrPredicates.td
    llvm/lib/Target/X86/X86MCInstLower.cpp
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/lib/Target/X86/X86RegisterInfo.td
    llvm/lib/Transforms/CFGuard/CFGuard.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/CFGuard.h b/llvm/include/llvm/Transforms/CFGuard.h
index caf822a2ec9fb..b81db8f487965 100644
--- a/llvm/include/llvm/Transforms/CFGuard.h
+++ b/llvm/include/llvm/Transforms/CFGuard.h
@@ -16,6 +16,7 @@
 namespace llvm {
 
 class FunctionPass;
+class GlobalValue;
 
 class CFGuardPass : public PassInfoMixin<CFGuardPass> {
 public:
@@ -34,6 +35,8 @@ FunctionPass *createCFGuardCheckPass();
 /// Insert Control FLow Guard dispatches on indirect function calls.
 FunctionPass *createCFGuardDispatchPass();
 
+bool isCFGuardFunction(const GlobalValue *GV);
+
 } // namespace llvm
 
 #endif

diff  --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index ab7552ca01061..9ad56aaf05bc5 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -599,6 +599,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
   if (T.getArch() == Triple::aarch64) {
     ImportCallSection =
         Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO);
+  } else if (T.getArch() == Triple::x86_64) {
+    // Import Call Optimization on x64 leverages the same metadata as the
+    // retpoline mitigation, hence the unusual section name.
+    ImportCallSection =
+        Ctx->getCOFFSection(".retplne", COFF::IMAGE_SCN_LNK_INFO);
   }
 
   // Debug info.

diff  --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 247b792d16ba2..24eda602effd1 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -464,7 +464,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
          Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
          Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
          Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
-         Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
+         Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX ||
+         Opc == X86::TAILJMPm64_REX;
 }
 
 void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
@@ -912,6 +913,9 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
   if (TT.isOSBinFormatCOFF()) {
     emitCOFFFeatureSymbol(M);
     emitCOFFReplaceableFunctionData(M);
+
+    if (M.getModuleFlag("import-call-optimization"))
+      EnableImportCallOptimization = true;
   }
   OutStreamer->emitSyntaxDirective();
 
@@ -1016,6 +1020,35 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
     // safe to set.
     OutStreamer->emitSubsectionsViaSymbols();
   } else if (TT.isOSBinFormatCOFF()) {
+    // If import call optimization is enabled, emit the appropriate section.
+    // We do this whether or not we recorded any items.
+    if (EnableImportCallOptimization) {
+      OutStreamer->switchSection(getObjFileLowering().getImportCallSection());
+
+      // Section always starts with some magic.
+      constexpr char ImpCallMagic[12] = "RetpolineV1";
+      OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)});
+
+      // Layout of this section is:
+      // Per section that contains an item to record:
+      //  uint32_t SectionSize: Size in bytes for information in this section.
+      //  uint32_t Section Number
+      //  Per call to imported function in section:
+      //    uint32_t Kind: the kind of item.
+      //    uint32_t InstOffset: the offset of the instr in its parent section.
+      for (auto &[Section, CallsToImportedFuncs] :
+           SectionToImportedFunctionCalls) {
+        unsigned SectionSize =
+            sizeof(uint32_t) * (2 + 2 * CallsToImportedFuncs.size());
+        OutStreamer->emitInt32(SectionSize);
+        OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol());
+        for (auto &[CallsiteSymbol, Kind] : CallsToImportedFuncs) {
+          OutStreamer->emitInt32(Kind);
+          OutStreamer->emitCOFFSecOffset(CallsiteSymbol);
+        }
+      }
+    }
+
     if (usesMSVCFloatingPoint(TT, M)) {
       // In Windows' libcmt.lib, there is a file which is linked in only if the
       // symbol _fltused is referenced. Linking this in causes some

diff  --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index 61d8f45501ab1..efb951b73532f 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -35,6 +35,26 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   bool EmitFPOData = false;
   bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false;
   bool IndCSPrefix = false;
+  bool EnableImportCallOptimization = false;
+
+  enum ImportCallKind : unsigned {
+    IMAGE_RETPOLINE_AMD64_IMPORT_BR = 0x02,
+    IMAGE_RETPOLINE_AMD64_IMPORT_CALL = 0x03,
+    IMAGE_RETPOLINE_AMD64_INDIR_BR = 0x04,
+    IMAGE_RETPOLINE_AMD64_INDIR_CALL = 0x05,
+    IMAGE_RETPOLINE_AMD64_INDIR_BR_REX = 0x06,
+    IMAGE_RETPOLINE_AMD64_CFG_BR = 0x08,
+    IMAGE_RETPOLINE_AMD64_CFG_CALL = 0x09,
+    IMAGE_RETPOLINE_AMD64_CFG_BR_REX = 0x0A,
+    IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST = 0x010,
+    IMAGE_RETPOLINE_AMD64_SWITCHTABLE_LAST = 0x01F,
+  };
+  struct ImportCallInfo {
+    MCSymbol *CalleeSymbol;
+    ImportCallKind Kind;
+  };
+  DenseMap<MCSection *, std::vector<ImportCallInfo>>
+      SectionToImportedFunctionCalls;
 
   // This utility class tracks the length of a stackmap instruction's 'shadow'.
   // It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -49,7 +69,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
     void startFunction(MachineFunction &MF) {
       this->MF = &MF;
     }
-    void count(MCInst &Inst, const MCSubtargetInfo &STI,
+    void count(const MCInst &Inst, const MCSubtargetInfo &STI,
                MCCodeEmitter *CodeEmitter);
 
     // Called to signal the start of a shadow of RequiredSize bytes.
@@ -130,6 +150,12 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI,
                                     MCSymbol *LazyPointer) override;
 
+  void emitCallInstruction(const llvm::MCInst &MCI);
+
+  // Emits a label to mark the next instruction as being relevant to Import Call
+  // Optimization.
+  void emitLabelAndRecordForImportCallOptimization(ImportCallKind Kind);
+
 public:
   X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 

diff  --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 398b738b85697..8ba6ed357d143 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -274,6 +274,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case X86::TCRETURNdi64:
   case X86::TCRETURNdi64cc:
   case X86::TCRETURNri64:
+  case X86::TCRETURNri64_ImpCall:
   case X86::TCRETURNmi64: {
     bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
     MachineOperand &JumpTarget = MBBI->getOperand(0);
@@ -345,12 +346,14 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
       MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
       for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
         MIB.add(MBBI->getOperand(i));
-    } else if (Opcode == X86::TCRETURNri64) {
+    } else if ((Opcode == X86::TCRETURNri64) ||
+               (Opcode == X86::TCRETURNri64_ImpCall)) {
       JumpTarget.setIsKill();
       BuildMI(MBB, MBBI, DL,
               TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
           .add(JumpTarget);
     } else {
+      assert(!IsWin64 && "Win64 requires REX for indirect jumps.");
       JumpTarget.setIsKill();
       BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
           .add(JumpTarget);
@@ -875,6 +878,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case X86::CALL64m_RVMARKER:
     expandCALL_RVMARKER(MBB, MBBI);
     return true;
+  case X86::CALL64r_ImpCall:
+    MI.setDesc(TII->get(X86::CALL64r));
+    return true;
   case X86::ADD32mi_ND:
   case X86::ADD64mi32_ND:
   case X86::SUB32mi_ND:

diff  --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index d7cb93bdb7376..0ff7f235ed392 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
@@ -3316,6 +3317,11 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
     if (Flag.isSwiftError() || Flag.isPreallocated())
       return false;
 
+  // Can't handle import call optimization.
+  if (Is64Bit &&
+      MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
+    return false;
+
   SmallVector<MVT, 16> OutVTs;
   SmallVector<Register, 16> ArgRegs;
 

diff  --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 7e960c6420d3b..75f49beee27c6 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2399,7 +2399,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
 static bool isTailCallOpcode(unsigned Opc) {
   return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
          Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
-         Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;
+         Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
+         Opc == X86::TCRETURNmi64;
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fdb853d683eba..e51d15a4d3825 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19179,7 +19179,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                                SelectionDAG &DAG) const {
-  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
 SDValue
@@ -19207,7 +19207,8 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 /// Creates target global address or external symbol nodes for calls or
 /// other uses.
 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
-                                                 bool ForCall) const {
+                                                 bool ForCall,
+                                                 bool *IsImpCall) const {
   // Unpack the global address or external symbol.
   SDLoc dl(Op);
   const GlobalValue *GV = nullptr;
@@ -19257,6 +19258,16 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
     return Result;
 
+  // If Import Call Optimization is enabled and this is an imported function
+  // then make a note of it and return the global address without wrapping.
+  if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
+      Mod.getModuleFlag("import-call-optimization")) {
+    assert(ForCall && "Should only enable import call optimization if we are "
+                      "lowering a call");
+    *IsImpCall = true;
+    return Result;
+  }
+
   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
 
   // With PIC, the address is actually $g + Offset.
@@ -19282,7 +19293,7 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
 
 SDValue
 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
-  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
@@ -34821,6 +34832,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FST)
   NODE_NAME_CASE(CALL)
   NODE_NAME_CASE(CALL_RVMARKER)
+  NODE_NAME_CASE(IMP_CALL)
   NODE_NAME_CASE(BT)
   NODE_NAME_CASE(CMP)
   NODE_NAME_CASE(FCMP)
@@ -62092,6 +62104,7 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
   Register TargetReg;
   switch (MBBI->getOpcode()) {
   case X86::CALL64r:
+  case X86::CALL64r_ImpCall:
   case X86::CALL64r_NT:
   case X86::TAILJMPr64:
   case X86::TAILJMPr64_REX:

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 00917115e9df2..359f24768b3da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -90,6 +90,10 @@ namespace llvm {
     /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
     POP_FROM_X87_REG,
 
+    // Pseudo for a call to an imported function to ensure the correct machine
+    // instruction is emitted for Import Call Optimization.
+    IMP_CALL,
+
     /// X86 compare and logical compare instructions.
     CMP,
     FCMP,
@@ -1746,8 +1750,8 @@ namespace llvm {
 
     /// Creates target global address or external symbol nodes for calls or
     /// other uses.
-    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
-                                  bool ForCall) const;
+    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
+                                  bool *IsImpCall) const;
 
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 03165311dfef8..1aa00d4f09f75 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2050,6 +2050,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (CallConv == CallingConv::X86_INTR)
     report_fatal_error("X86 interrupts may not be called directly");
 
+  if (IsIndirectCall && !IsWin64 &&
+      M->getModuleFlag("import-call-optimization"))
+    errorUnsupported(DAG, dl,
+                     "Indirect calls must have a normal calling convention if "
+                     "Import Call Optimization is enabled");
+
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -2421,6 +2427,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     InGlue = Chain.getValue(1);
   }
 
+  bool IsImpCall = false;
   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
     // In the 64-bit large code model, we have to make all calls
@@ -2433,7 +2440,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // ForCall to true here has the effect of removing WrapperRIP when possible
     // to allow direct calls to be selected without first materializing the
     // address into a register.
-    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
+    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true, &IsImpCall);
   } else if (Subtarget.isTarget64BitILP32() &&
              Callee.getValueType() == MVT::i32) {
     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -2555,7 +2562,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Returns a chain & a glue for retval copy to use.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  if (IsNoTrackIndirectCall) {
+  if (IsImpCall) {
+    Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
+  } else if (IsNoTrackIndirectCall) {
     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
     // Calls with a "clang.arc.attachedcall" bundle are special. They should be

diff  --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index efa1e8bd7f3e3..927b2c8b22f05 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1313,6 +1313,8 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
 def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
           (CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
 
+def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
 
 // Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
 // can never use callee-saved registers. That is the purpose of the GR64_TC
@@ -1344,7 +1346,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
 
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
           (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
-          Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
+          Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+          (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
+          Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>;
 
 // Don't fold loads into X86tcret requiring more than 6 regs.
 // There wouldn't be enough scratch registers for base+index.

diff  --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 4907105e6b8cc..22253bf0413a4 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -327,7 +327,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
                       Requires<[In64BitMode]>;
   def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
                         "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
-                      Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
+                      Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
   def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
                         "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
                       Requires<[In64BitMode,FavorMemIndirectCall,
@@ -357,6 +357,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
   def TCRETURNri64   : PseudoI<(outs),
                                (ins ptr_rc_tailcall:$dst, i32imm:$offset),
                                []>, Sched<[WriteJump]>;
+  def TCRETURNri64_ImpCall   : PseudoI<(outs),
+                               (ins GR64_A:$dst, i32imm:$offset),
+                               []>, Sched<[WriteJump]>;
+
   let mayLoad = 1 in
   def TCRETURNmi64   : PseudoI<(outs),
                                (ins i64mem_TC:$dst, i32imm:$offset),
@@ -418,6 +422,10 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
   def CALL64pcrel32_RVMARKER :
     PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
             Requires<[In64BitMode]>;
+
+  def CALL64r_ImpCall :
+    PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>,
+            Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>;
 }
 
 // Conditional tail calls are similar to the above, but they are branches

diff  --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index f9d70d1bb5d85..fe95b8c20a8ff 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -210,6 +210,9 @@ def X86call_rvmarker  : SDNode<"X86ISD::CALL_RVMARKER",     SDT_X86Call,
                         [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                          SDNPVariadic]>;
 
+def X86imp_call  : SDNode<"X86ISD::IMP_CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                         SDNPVariadic]>;
 
 def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
                             [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 963a2bb84e185..7d9fa759ad81e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3714,6 +3714,7 @@ bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
+  case X86::TCRETURNri64_ImpCall:
   case X86::TCRETURNmi64:
     return true;
   default:
@@ -7458,7 +7459,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   // do not fold loads into calls or pushes, unless optimizing for size
   // aggressively.
   if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
-      (Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r ||
+      (Opc == X86::CALL32r || Opc == X86::CALL64r ||
+       Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
        Opc == X86::PUSH32r || Opc == X86::PUSH64r))
     return nullptr;
 

diff  --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 5bdcf51be9dd8..307c03c8ef541 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -233,6 +233,8 @@ let RecomputePerFunction = 1 in {
                             "shouldOptForSize(MF)">;
   def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
                                         "!Subtarget->hasSSE41()">;
+  def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
+  def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
 }
 
 def CallImmAddr  : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;

diff  --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 165bcb0ba9647..55d57d15f8d42 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -48,6 +48,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/CFGuard.h"
 #include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
 #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
 #include <string>
@@ -113,7 +114,7 @@ struct NoAutoPaddingScope {
 static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
                         const X86Subtarget *Subtarget);
 
-void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+void X86AsmPrinter::StackMapShadowTracker::count(const MCInst &Inst,
                                                  const MCSubtargetInfo &STI,
                                                  MCCodeEmitter *CodeEmitter) {
   if (InShadow) {
@@ -2214,6 +2215,31 @@ static void addConstantComments(const MachineInstr *MI,
   }
 }
 
+// Does the given operand refer to a DLLIMPORT function?
+bool isImportedFunction(const MachineOperand &MO) {
+  return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_DLLIMPORT);
+}
+
+// Is the given instruction a call to a CFGuard function?
+bool isCallToCFGuardFunction(const MachineInstr *MI) {
+  assert(MI->getOpcode() == X86::TAILJMPm64_REX ||
+         MI->getOpcode() == X86::CALL64m);
+  const MachineOperand &MO = MI->getOperand(3);
+  return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_NO_FLAG) &&
+         isCFGuardFunction(MO.getGlobal());
+}
+
+// Does the containing block for the given instruction contain any jump table
+// info (indicating that the block is a dispatch for a jump table)?
+bool hasJumpTableInfoInBlock(const llvm::MachineInstr *MI) {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I)
+    if (I->isJumpTableDebugInfo())
+      return true;
+
+  return false;
+}
+
 void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -2292,7 +2318,16 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::TAILJMPd64:
     if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
       EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
-    [[fallthrough]];
+
+    if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_IMPORT_BR);
+    }
+
+    // Lower this as normal, but add a comment.
+    OutStreamer->AddComment("TAILCALL");
+    break;
+
   case X86::TAILJMPr:
   case X86::TAILJMPm:
   case X86::TAILJMPd:
@@ -2300,12 +2335,58 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::TAILJMPr64:
   case X86::TAILJMPm64:
   case X86::TAILJMPd64_CC:
-  case X86::TAILJMPr64_REX:
-  case X86::TAILJMPm64_REX:
+    if (EnableImportCallOptimization)
+      report_fatal_error("Unexpected TAILJMP instruction was emitted when "
+                         "import call optimization was enabled");
+
     // Lower these as normal, but add some comments.
     OutStreamer->AddComment("TAILCALL");
     break;
 
+  case X86::TAILJMPm64_REX:
+    if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+    }
+
+    OutStreamer->AddComment("TAILCALL");
+    break;
+
+  case X86::TAILJMPr64_REX: {
+    if (EnableImportCallOptimization) {
+      assert(MI->getOperand(0).getReg() == X86::RAX &&
+             "Indirect tail calls with impcall enabled must go through RAX (as "
+             "enforced by TCRETURNri64_ImpCall)");
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_INDIR_BR);
+    }
+
+    OutStreamer->AddComment("TAILCALL");
+    break;
+  }
+
+  case X86::JMP64r:
+    if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI)) {
+      uint16_t EncodedReg =
+          this->getSubtarget().getRegisterInfo()->getEncodingValue(
+              MI->getOperand(0).getReg().asMCReg());
+      emitLabelAndRecordForImportCallOptimization(
+          (ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST +
+                           EncodedReg));
+    }
+    break;
+
+  case X86::JMP16r:
+  case X86::JMP16m:
+  case X86::JMP32r:
+  case X86::JMP32m:
+  case X86::JMP64m:
+    if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI))
+      report_fatal_error(
+          "Unexpected JMP instruction was emitted for a jump-table when import "
+          "call optimization was enabled");
+    break;
+
   case X86::TLS_addr32:
   case X86::TLS_addr64:
   case X86::TLS_addrX32:
@@ -2492,7 +2573,50 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::CALL64pcrel32:
     if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
       EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
+
+    if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
+
+      MCInst TmpInst;
+      MCInstLowering.Lower(MI, TmpInst);
+
+      // For Import Call Optimization to work, the call instruction must have
+      // a rex prefix and be followed by a 5-byte nop.
+      EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+      emitCallInstruction(TmpInst);
+      emitNop(*OutStreamer, 5, Subtarget);
+      return;
+    }
+
+    break;
+
+  case X86::CALL64r:
+    if (EnableImportCallOptimization) {
+      assert(MI->getOperand(0).getReg() == X86::RAX &&
+             "Indirect calls with impcall enabled must go through RAX (as "
+             "enforced by CALL64r_ImpCall)");
+
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_INDIR_CALL);
+      MCInst TmpInst;
+      MCInstLowering.Lower(MI, TmpInst);
+      emitCallInstruction(TmpInst);
+
+      // For Import Call Optimization to work, we need a 3-byte nop after the
+      // call instruction.
+      emitNop(*OutStreamer, 3, Subtarget);
+      return;
+    }
+    break;
+
+  case X86::CALL64m:
+    if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
+      emitLabelAndRecordForImportCallOptimization(
+          IMAGE_RETPOLINE_AMD64_CFG_CALL);
+    }
     break;
+
   case X86::JCC_1:
     // Two instruction prefixes (2EH for branch not-taken and 3EH for branch
     // taken) are used as branch hints. Here we add branch taken prefix for
@@ -2513,20 +2637,36 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
   MCInstLowering.Lower(MI, TmpInst);
 
-  // Stackmap shadows cannot include branch targets, so we can count the bytes
-  // in a call towards the shadow, but must ensure that the no thread returns
-  // in to the stackmap shadow.  The only way to achieve this is if the call
-  // is at the end of the shadow.
   if (MI->isCall()) {
-    // Count then size of the call towards the shadow
-    SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
-    // Then flush the shadow so that we fill with nops before the call, not
-    // after it.
-    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
-    // Then emit the call
-    OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
+    emitCallInstruction(TmpInst);
     return;
   }
 
   EmitAndCountInstruction(TmpInst);
 }
+
+void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
+  // Stackmap shadows cannot include branch targets, so we can count the bytes
+  // in a call towards the shadow, but must ensure that no thread returns
+  // into the stackmap shadow.  The only way to achieve this is if the call
+  // is at the end of the shadow.
+
+  // Count the size of the call towards the shadow
+  SMShadowTracker.count(MCI, getSubtargetInfo(), CodeEmitter.get());
+  // Then flush the shadow so that we fill with nops before the call, not
+  // after it.
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  // Then emit the call
+  OutStreamer->emitInstruction(MCI, getSubtargetInfo());
+}
+
+void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
+    ImportCallKind Kind) {
+  assert(EnableImportCallOptimization);
+
+  MCSymbol *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall");
+  OutStreamer->emitLabel(CallSiteSymbol);
+
+  SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()]
+      .push_back({CallSiteSymbol, Kind});
+}

diff  --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index c192e8892995b..71d36594afaeb 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -999,6 +999,7 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
+  case X86::TCRETURNri64_ImpCall:
   case X86::TCRETURNmi64:
   case X86::EH_RETURN:
   case X86::EH_RETURN64: {

diff  --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 48459b3aca508..3f9af5639a686 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -737,6 +737,10 @@ def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>;
 def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>;
 def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
 
+// Class to support Windows Import Call Optimization: all indirect jumps must
+// happen through RAX.
+def GR64_A : RegisterClass<"X86", [i64], 64, (add RAX)>;
+
 // Scalar SSE2 floating point registers.
 def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
 

diff  --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 45c2a3394da02..b73a0ce2e9ff4 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -31,6 +31,9 @@ using OperandBundleDef = OperandBundleDefT<Value *>;
 
 STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
 
+constexpr StringRef GuardCheckFunctionName = "__guard_check_icall_fptr";
+constexpr StringRef GuardDispatchFunctionName = "__guard_dispatch_icall_fptr";
+
 namespace {
 
 /// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
@@ -45,10 +48,10 @@ class CFGuardImpl {
     // Get or insert the guard check or dispatch global symbols.
     switch (GuardMechanism) {
     case Mechanism::Check:
-      GuardFnName = "__guard_check_icall_fptr";
+      GuardFnName = GuardCheckFunctionName;
       break;
     case Mechanism::Dispatch:
-      GuardFnName = "__guard_dispatch_icall_fptr";
+      GuardFnName = GuardDispatchFunctionName;
       break;
     }
   }
@@ -318,3 +321,11 @@ FunctionPass *llvm::createCFGuardCheckPass() {
 FunctionPass *llvm::createCFGuardDispatchPass() {
   return new CFGuard(CFGuardPass::Mechanism::Dispatch);
 }
+
+bool llvm::isCFGuardFunction(const GlobalValue *GV) {
+  if (GV->getLinkage() != GlobalValue::ExternalLinkage)
+    return false;
+
+  StringRef Name = GV->getName();
+  return Name == GuardCheckFunctionName || Name == GuardDispatchFunctionName;
+}

diff  --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
new file mode 100644
index 0000000000000..12be910d68ee9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+
+define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
+entry:
+  call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     callq   *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  tail_call_fp:
+; CHECK:        .Limpcall1:
+; CHECK-NEXT:     rex64 jmpq      *__guard_dispatch_icall_fptr(%rip)
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NEXT   .long   16
+; CHECK-NEXT   .secnum tc_sect
+; CHECK-NEXT   .long   10
+; CHECK-NEXT   .secoffset      .Limpcall1
+; CHECK-NEXT   .long   16
+; CHECK-NEXT   .secnum nc_sect
+; CHECK-NEXT   .long   9
+; CHECK-NEXT   .secoffset      .Limpcall0
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"import-call-optimization", i32 1}
+!1 = !{i32 2, !"cfguard", i32 2}

diff  --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
new file mode 100644
index 0000000000000..fe22b251685e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+
+; CHECK-LABEL:  uses_rax:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     jmpq    *%rax
+
+define void @uses_rax(i32 %x) {
+entry:
+  switch i32 %x, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  tail call void @g(i32 0) #2
+  br label %sw.epilog
+
+sw.bb1:
+  tail call void @g(i32 1) #2
+  br label %sw.epilog
+
+sw.bb2:
+  tail call void @g(i32 2) #2
+  br label %sw.epilog
+
+sw.bb3:
+  tail call void @g(i32 3) #2
+  br label %sw.epilog
+
+sw.epilog:
+  tail call void @g(i32 10) #2
+  ret void
+}
+
+; CHECK-LABEL:  uses_rcx:
+; CHECK:        .Limpcall1:
+; CHECK-NEXT:     jmpq    *%rcx
+
+define void @uses_rcx(i32 %x) {
+entry:
+  switch i32 %x, label %sw.epilog [
+    i32 10, label %sw.bb
+    i32 11, label %sw.bb1
+    i32 12, label %sw.bb2
+    i32 13, label %sw.bb3
+  ]
+
+sw.bb:
+  tail call void @g(i32 0) #2
+  br label %sw.epilog
+
+sw.bb1:
+  tail call void @g(i32 1) #2
+  br label %sw.epilog
+
+sw.bb2:
+  tail call void @g(i32 2) #2
+  br label %sw.epilog
+
+sw.bb3:
+  tail call void @g(i32 3) #2
+  br label %sw.epilog
+
+sw.epilog:
+  tail call void @g(i32 10) #2
+  ret void
+}
+
+declare void @g(i32)
+
+; CHECK-LABEL:  .section        .retplne,"yi"
+; CHECK-NEXT:   .asciz  "RetpolineV1"
+; CHECK-NEXT:   .long   24
+; CHECK-NEXT:   .secnum .text
+; CHECK-NEXT:   .long   16
+; CHECK-NEXT:   .secoffset      .Limpcall0
+; CHECK-NEXT:   .long   17
+; CHECK-NEXT:   .secoffset      .Limpcall1
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}

diff  --git a/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
new file mode 100644
index 0000000000000..4ca7b85282f2e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-nocalls.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+
+define dso_local void @normal_call() local_unnamed_addr {
+entry:
+  call void @a()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        callq   a
+
+declare void @a() local_unnamed_addr
+
+; Even if there are no calls to imported functions, we still need to emit the
+; .impcall section.
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NOT    .secnum
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}

diff  --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
new file mode 100644
index 0000000000000..cc7e1a9f81e34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+
+define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
+entry:
+  call void @a()
+  call void @a()
+  call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  normal_call:
+; CHECK:        .Limpcall0:
+; CHECK-NEXT:     rex64
+; CHECK-NEXT:     callq   __imp_a
+; CHECK-NEXT:     nopl    8(%rax,%rax)
+; CHECK-NEXT:   .Limpcall1:
+; CHECK-NEXT:     rex64
+; CHECK-NEXT:     callq   __imp_a
+; CHECK-NEXT:     nopl    8(%rax,%rax)
+; CHECK-NEXT:     movq    %rsi, %rax
+; CHECK-NEXT:   .Limpcall2:
+; CHECK-NEXT:     callq   *%rax
+; CHECK-NEXT:     nopl    (%rax)
+; CHECK-NEXT:     nop
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void @b()
+  ret void
+}
+; CHECK-LABEL:  tail_call:
+; CHECK:        .Limpcall3:
+; CHECK-NEXT:     jmp __imp_b
+
+define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+  tail call void %func_ptr()
+  ret void
+}
+; CHECK-LABEL:  tail_call_fp:
+; CHECK:          movq    %rcx, %rax
+; CHECK-NEXT:   .Limpcall4:
+; CHECK-NEXT:     rex64 jmpq      *%rax
+
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+
+; CHECK-LABEL  .section   .retplne,"yi"
+; CHECK-NEXT   .asciz  "RetpolineV1"
+; CHECK-NEXT   .long   24
+; CHECK-NEXT   .secnum tc_sect
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall3
+; CHECK-NEXT   .long   5
+; CHECK-NEXT   .secoffset      .Limpcall4
+; CHECK-NEXT   .long   32
+; CHECK-NEXT   .secnum nc_sect
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall0
+; CHECK-NEXT   .long   3
+; CHECK-NEXT   .secoffset      .Limpcall1
+; CHECK-NEXT   .long   5
+; CHECK-NEXT   .secoffset      .Limpcall2
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"import-call-optimization", i32 1}

diff  --git a/llvm/test/MC/X86/win-import-call-optimization.s b/llvm/test/MC/X86/win-import-call-optimization.s
new file mode 100644
index 0000000000000..4f839a2bc6011
--- /dev/null
+++ b/llvm/test/MC/X86/win-import-call-optimization.s
@@ -0,0 +1,69 @@
+// RUN: llvm-mc -triple x86_64-windows-msvc -filetype obj -o %t.obj %s
+// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s
+
+.section        nc_sect,"xr"
+normal_call:
+.seh_proc normal_call
+# %bb.0:                                # %entry
+        subq    $40, %rsp
+        .seh_stackalloc 40
+        .seh_endprologue
+.Limpcall0:
+        rex64
+        callq   *__imp_a(%rip)
+        nopl    8(%rax,%rax)
+        nop
+        addq    $40, %rsp
+        retq
+        .seh_endproc
+
+.section        tc_sect,"xr"
+tail_call:
+.Limpcall1:
+        rex64
+        jmp     *__imp_b(%rip)
+
+.section        .retplne,"yi"
+.asciz  "RetpolineV1"
+.long   16
+.secnum tc_sect
+.long   2
+.secoffset .Limpcall1
+.long   16
+.secnum nc_sect
+.long   3
+.secoffset .Limpcall0
+
+// CHECK-LABEL: Name: .retplne (2E 72 65 74 70 6C 6E 65)
+// CHECK-NEXT:  VirtualSize: 0x0
+// CHECK-NEXT:  VirtualAddress: 0x0
+// CHECK-NEXT:  RawDataSize: 44
+// CHECK-NEXT:  PointerToRawData:
+// CHECK-NEXT:  PointerToRelocations:
+// CHECK-NEXT:  PointerToLineNumbers:
+// CHECK-NEXT:  RelocationCount: 0
+// CHECK-NEXT:  LineNumberCount: 0
+// CHECK-NEXT:  Characteristics [
+// CHECK-NEXT:    IMAGE_SCN_ALIGN_1BYTES
+// CHECK-NEXT:    IMAGE_SCN_LNK_INFO
+// CHECK-NEXT:  ]
+// CHECK-NEXT:  SectionData (
+// CHECK-NEXT:    52657470 6F6C696E 65563100 10000000  |RetpolineV1.....|
+// CHECK-NEXT:    0010:
+// CHECK-SAME:    [[#%.2X,TCSECT:]]000000
+// CHECK-SAME:    02000000
+// CHECK-SAME:    [[#%.2X,TCOFFSET:]]000000
+// CHECK-SAME:    10000000
+// CHECK-NEXT:    0020:
+// CHECK-SAME:    [[#%.2X,NCSECT:]]000000
+// CHECK-SAME:    03000000
+// CHECK-SAME:    [[#%.2X,NCOFFSET:]]000000
+// CHECK-NEXT:  )
+
+// CHECK-LABEL: Relocations [
+// CHECK-NEXT:     Section ([[#%u,NCSECT]]) nc_sect {
+// CHECK-NEXT:       0x[[#%x,NCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_a
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Section ([[#%u,TCSECT]]) tc_sect {
+// CHECK-NEXT:       0x[[#%x,TCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_b
+// CHECK-NEXT:     }


        

