[clang] [llvm] [win][x64] Multiple fixes for import call optimization (PR #160604)
Daniel Paoliello via cfe-commits
cfe-commits at lists.llvm.org
Fri Jan 16 14:58:16 PST 2026
https://github.com/dpaoliello updated https://github.com/llvm/llvm-project/pull/160604
>From 81fb78cfc7e00cb60c4979dbdae059bccc612bc7 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Fri, 12 Dec 2025 09:45:37 -0800
Subject: [PATCH] [win][x64] Fix import call optimization for calls to
dllimports and global function pointers
---
.../CodeGenCXX/microsoft-abi-eh-ip2state.cpp | 2 +-
llvm/lib/Target/X86/X86AsmPrinter.cpp | 4 +-
llvm/lib/Target/X86/X86AsmPrinter.h | 2 +-
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 8 +-
llvm/lib/Target/X86/X86FastISel.cpp | 16 +-
llvm/lib/Target/X86/X86FrameLowering.cpp | 4 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 -
llvm/lib/Target/X86/X86ISelLowering.h | 4 -
llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 16 +-
llvm/lib/Target/X86/X86InstrCompiler.td | 11 +-
llvm/lib/Target/X86/X86InstrControl.td | 10 +-
llvm/lib/Target/X86/X86InstrFragments.td | 4 -
llvm/lib/Target/X86/X86InstrInfo.cpp | 15 +-
llvm/lib/Target/X86/X86InstrPredicates.td | 2 -
llvm/lib/Target/X86/X86MCInstLower.cpp | 114 +++++++---
llvm/lib/Target/X86/X86RegisterInfo.cpp | 1 -
llvm/lib/Target/X86/X86RegisterInfo.td | 4 -
.../win-import-call-optimization-cfguard.ll | 154 +++++++++++--
.../win-import-call-optimization-jumptable.ll | 61 ++++--
.../X86/win-import-call-optimization.ll | 202 ++++++++++++++----
20 files changed, 463 insertions(+), 173 deletions(-)
diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
index 0b7b406e2ba8e..541789fc9d339 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
@@ -40,7 +40,7 @@ void case_calls_dll_import() NO_TAIL {
// CHECK: .seh_endprologue
// CHECK: .Limpcall{{[0-9]+}}:
// CHECK-NEXT: rex64
-// CHECK-NEXT: call __imp_some_dll_import
+// CHECK-NEXT: call qword ptr [rip + __imp_some_dll_import]
// CHECK-NEXT: nop dword ptr {{\[.*\]}}
// CHECK-NEXT: nop
// CHECK-NEXT: .seh_startepilogue
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 84b921222a116..6a876b8963545 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -479,8 +479,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURN_WINmi64 || Opc == X86::TCRETURNri64 ||
- Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall ||
- Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
+ Opc == X86::TCRETURNmi64 || Opc == X86::TAILJMPr64_REX ||
+ Opc == X86::TAILJMPm64_REX;
}
void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index e02b5562d3b5e..7c55f06e86d4b 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -53,7 +53,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
MCSymbol *CalleeSymbol;
ImportCallKind Kind;
};
- DenseMap<MCSection *, std::vector<ImportCallInfo>>
+ MapVector<MCSection *, std::vector<ImportCallInfo>>
SectionToImportedFunctionCalls;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 2f5ee9d2c9a13..6574bd2d974a8 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -290,7 +290,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
case X86::TCRETURNdi64:
case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
case X86::TCRETURN_WINmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64 ||
@@ -366,9 +365,7 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
- } else if (Opcode == X86::TCRETURNri64 ||
- Opcode == X86::TCRETURNri64_ImpCall ||
- Opcode == X86::TCRETURN_WIN64ri) {
+ } else if (Opcode == X86::TCRETURNri64 || Opcode == X86::TCRETURN_WIN64ri) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
@@ -722,9 +719,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
case X86::CALL64m_RVMARKER:
expandCALL_RVMARKER(MBB, MBBI);
return true;
- case X86::CALL64r_ImpCall:
- MI.setDesc(TII->get(X86::CALL64r));
- return true;
case X86::ADD32mi_ND:
case X86::ADD64mi32_ND:
case X86::SUB32mi_ND:
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index c69ca77031495..26b247c797b00 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3326,11 +3326,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
- // Can't handle import call optimization.
- if (Is64Bit &&
- MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
- return false;
-
SmallVector<MVT, 16> OutVTs;
SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
@@ -3572,6 +3567,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+
+ const Module *M = FuncInfo.MF->getFunction().getParent();
+ if (CalleeOp != X86::RAX && Is64Bit &&
+ M->getModuleFlag("import-call-optimization")) {
+ // Import call optimization requires all indirect calls to be via RAX.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::COPY), X86::RAX)
+ .addReg(CalleeOp);
+ CalleeOp = X86::RAX;
+ }
+
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 8bca6344d6521..7494f756de68a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2400,8 +2400,8 @@ static bool isTailCallOpcode(unsigned Opc) {
return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
- Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
- Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURN_WINmi64;
+ Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64 ||
+ Opc == X86::TCRETURN_WINmi64;
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ef94c198558c7..3d8c455af01ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35396,7 +35396,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FST)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(CALL_RVMARKER)
- NODE_NAME_CASE(IMP_CALL)
NODE_NAME_CASE(BT)
NODE_NAME_CASE(CMP)
NODE_NAME_CASE(FCMP)
@@ -63310,7 +63309,6 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
Register TargetReg;
switch (MBBI->getOpcode()) {
case X86::CALL64r:
- case X86::CALL64r_ImpCall:
case X86::CALL64r_NT:
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 08d5e2331727b..a24c8dde6497b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -94,10 +94,6 @@ namespace llvm {
/// POP_FROM_X87_REG (which may remove a required FPU stack pop).
POP_FROM_X87_REG,
- // Pseudo for a call to an imported function to ensure the correct machine
- // instruction is emitted for Import Call Optimization.
- IMP_CALL,
-
/// X86 compare and logical compare instructions.
CMP,
FCMP,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index f4de2f8c6c22e..f2fb5c685f348 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2585,6 +2585,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"CFG Function load should not have an offset");
Callee = DAG.getTargetGlobalAddress(
GA->getGlobal(), dl, GA->getValueType(0), 0, X86II::MO_NO_FLAG);
+ } else if (M->getModuleFlag("import-call-optimization")) {
+ // When import call optimization is enabled, all register indirect calls
+ // must use RAX.
+ Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Callee, InGlue);
+ InGlue = Chain.getValue(1);
+ Callee = DAG.getRegister(X86::RAX, Callee.getValueType());
}
SmallVector<SDValue, 8> Ops;
@@ -2689,8 +2695,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
- auto Opcode =
- IsCFGuardCall ? X86ISD::TC_RETURN_GLOBALADDR : X86ISD::TC_RETURN;
+ auto Opcode = (IsCFGuardCall || IsImpCall) ? X86ISD::TC_RETURN_GLOBALADDR
+ : X86ISD::TC_RETURN;
SDValue Ret = DAG.getNode(Opcode, dl, MVT::Other, Ops);
if (IsCFICall)
@@ -2703,11 +2709,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain & a glue for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- if (IsImpCall) {
- Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
- } else if (IsNoTrackIndirectCall) {
+ if (IsNoTrackIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
- } else if (IsCFGuardCall) {
+ } else if (IsCFGuardCall || IsImpCall) {
Chain = DAG.getNode(X86ISD::CALL_GLOBALADDR, dl, NodeTys, Ops);
} else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
// Calls with a "clang.arc.attachedcall" bundle are special. They should be
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 3e07db678809d..3ca60135784de 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1322,9 +1322,6 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
(CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
-def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>;
-
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
@@ -1359,15 +1356,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
(TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
- Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
-
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>;
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index c67feb7668234..aed7df993880a 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -331,7 +331,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
@@ -364,10 +364,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
- def TCRETURNri64_ImpCall : PseudoI<(outs),
- (ins GR64_A:$dst, i32imm:$offset),
- []>, Sched<[WriteJump]>;
-
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
(ins i64mem_TC:$dst, i32imm:$offset),
@@ -433,10 +429,6 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
def CALL64pcrel32_RVMARKER :
PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
Requires<[In64BitMode]>;
-
- def CALL64r_ImpCall :
- PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>,
- Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>;
}
// Conditional tail calls are similar to the above, but they are branches
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 38ab02667317e..29bf4c46ae69c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -214,10 +214,6 @@ def X86call_globaladdr : SDNode<"X86ISD::CALL_GLOBALADDR", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
-def X86imp_call : SDNode<"X86ISD::IMP_CALL", SDT_X86Call,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
- SDNPVariadic]>;
-
def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 53b148c11c4e1..ebe60922abd9f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3700,7 +3700,6 @@ bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
return true;
default:
@@ -3731,9 +3730,16 @@ bool X86InstrInfo::canMakeTailCallConditional(
return false;
}
- if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
+ if (Subtarget.isTargetWin64()) {
// Conditional tail calls confuse the Win64 unwinder.
- return false;
+ if (MF->hasWinCFI())
+ return false;
+
+ // Conditional tail calls cannot be encoded in the Import Call Optimization
+ // metadata.
+ if (MF->getFunction().getParent()->getModuleFlag(
+ "import-call-optimization"))
+ return false;
}
assert(BranchCond.size() == 1);
@@ -7496,8 +7502,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
- (Opc == X86::CALL32r || Opc == X86::CALL64r ||
- Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
+ (Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r ||
Opc == X86::PUSH32r || Opc == X86::PUSH64r))
return nullptr;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..1d23604d66d2c 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -237,8 +237,6 @@ let RecomputePerFunction = 1 in {
"shouldOptForSize(MF)">;
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
- def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
- def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 2287a921a19c0..0a70f1ad7b8f8 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2311,10 +2311,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
- if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_IMPORT_BR);
- }
+ if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0)))
+ reportFatalInternalError(
+ "Tail jumps to imported functions must use TAILJMPm64_REX");
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
@@ -2329,8 +2328,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPm64:
case X86::TAILJMPd64_CC:
if (EnableImportCallOptimization)
- report_fatal_error("Unexpected TAILJMP instruction was emitted when "
- "import call optimization was enabled");
+ reportFatalInternalError(
+ "Unexpected TAILJMP instruction was emitted when "
+ "import call optimization was enabled");
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
@@ -2338,9 +2338,22 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case X86::TAILJMPm64_REX:
- if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+ if (EnableImportCallOptimization) {
+ if (isCallToCFGuardFunction(MI)) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+ } else if (isImportedFunction(MI->getOperand(3))) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_IMPORT_BR);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by five int3 instructions.
+ for (int i = 0; i < 5; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
+ }
}
OutStreamer->AddComment("TAILCALL");
@@ -2349,11 +2362,20 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr64_REX: {
if (EnableImportCallOptimization) {
- assert(MI->getOperand(0).getReg() == X86::RAX &&
- "Indirect tail calls with impcall enabled must go through RAX (as "
- "enforced by TCRETURNImpCallri64)");
+ if (MI->getOperand(0).getReg() != X86::RAX)
+ reportFatalInternalError(
+ "Indirect tail calls with import call optimization enabled must "
+ "go through RAX");
emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_INDIR_BR);
+ IMAGE_RETPOLINE_AMD64_INDIR_BR_REX);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by 2 int3 instructions.
+ for (int i = 0; i < 2; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
}
OutStreamer->AddComment("TAILCALL");
@@ -2369,6 +2391,14 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
emitLabelAndRecordForImportCallOptimization(
(ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST +
EncodedReg));
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by 4 int3 instructions.
+ for (int i = 0; i < 4; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
}
break;
@@ -2378,7 +2408,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::JMP32m:
case X86::JMP64m:
if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI))
- report_fatal_error(
+ reportFatalInternalError(
"Unexpected JMP instruction was emitted for a jump-table when import "
"call optimization was enabled");
break;
@@ -2550,29 +2580,19 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
- if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
-
- MCInst TmpInst;
- MCInstLowering.Lower(MI, TmpInst);
-
- // For Import Call Optimization to work, we need a the call instruction
- // with a rex prefix, and a 5-byte nop after the call instruction.
- EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
- emitCallInstruction(TmpInst);
- emitNop(*OutStreamer, 5, Subtarget);
- maybeEmitNopAfterCallForWindowsEH(MI);
- return;
- }
+ if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0)))
+ reportFatalInternalError(
+ "Calls to imported functions with import call optimization "
+ "should be lowered to CALL64m via CALL64_ImpCall");
break;
case X86::CALL64r:
if (EnableImportCallOptimization) {
- assert(MI->getOperand(0).getReg() == X86::RAX &&
- "Indirect calls with impcall enabled must go through RAX (as "
- "enforced by CALL64r_ImpCall)");
+ if (MI->getOperand(0).getReg() != X86::RAX)
+ reportFatalInternalError(
+ "Indirect calls with import call optimization enabled must go "
+ "through RAX");
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_INDIR_CALL);
@@ -2589,9 +2609,33 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case X86::CALL64m:
- if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ if (EnableImportCallOptimization) {
+ if (isCallToCFGuardFunction(MI)) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ } else if (isImportedFunction(MI->getOperand(3))) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ // For Import Call Optimization to work, we need the call instruction
+ // with a rex prefix, and a 5-byte nop after the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ emitCallInstruction(TmpInst);
+ // The MSVC linker is *very* picky about the exact nop to use.
+ MCInst Nop = MCInstBuilder(X86::NOOPL)
+ .addReg(X86::RAX)
+ .addImm(1)
+ .addReg(X86::RAX)
+ .addImm(0)
+ .addReg(0);
+ Nop.setFlags(X86::IP_USE_DISP8);
+ EmitAndCountInstruction(Nop);
+ maybeEmitNopAfterCallForWindowsEH(MI);
+ return;
+ }
}
break;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 72f38133e21ff..5878a0f7a61d3 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -983,7 +983,6 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
case X86::TCRETURN_WINmi64:
case X86::EH_RETURN:
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 692e42ae5e752..a513371506038 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -728,10 +728,6 @@ def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>;
def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>;
def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
-// Class to support Windows Import Call Optimization: all indirect jumps must
-// happen through RAX.
-def GR64_A : RegisterClass<"X86", [i64], 64, (add RAX)>;
-
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
index 12be910d68ee9..39d5a2596b5b6 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
@@ -1,33 +1,151 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \
+; RUN: FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+declare dso_local i32 @__C_specific_handler(...)
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
+ call void @a()
+ call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
+ ret void
+}
+; ASM-LABEL: normal_call:
+; ASM: movq %rcx, %rsi
+; ASM-NEXT: .Limpcall0:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: .Limpcall1:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: movq %rsi, %rax
+; ASM-NEXT: .Limpcall2:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall3:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: nop
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+ tail call void @b()
ret void
}
-; CHECK-LABEL: normal_call:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-LABEL: tail_call:
+; ASM: .Limpcall4:
+; ASM-NEXT: rex64 jmpq *__imp_b(%rip)
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
-; CHECK-LABEL: tail_call_fp:
-; CHECK: .Limpcall1:
-; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
-
-; CHECK-LABEL .section .retplne,"yi"
-; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 16
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 10
-; CHECK-NEXT .secoffset .Limpcall1
-; CHECK-NEXT .long 16
-; CHECK-NEXT .secnum nc_sect
-; CHECK-NEXT .long 9
-; CHECK-NEXT .secoffset .Limpcall0
+; ASM-LABEL: tail_call_fp:
+; ASM: movq %rcx, %rax
+; ASM-NEXT: .Limpcall5:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; ASM-LABEL: tail_call_global_fp:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall6:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+; Regression test: the call to the CFG Guard was being indirected via a register, which is not
+; permitted when retpoline is enabled.
+define dso_local i32 @might_call_global_func_ptr(ptr %0, ptr %1, i32 %2) {
+3:
+ %4 = icmp eq i32 %2, 0
+ br i1 %4, label %5, label %8
+
+5: ; preds = %11
+ %6 = load ptr, ptr @global_func_ptr, align 8
+ %7 = tail call i32 %6(ptr noundef %1)
+ br label %8
+
+8:
+ %9 = phi i32 [ %7, %5 ], [ -1, %3 ]
+ ret i32 %9
+}
+; ASM-LABEL: might_call_global_func_ptr:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall7:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @invoke_many_args(ptr %0, ptr %1, ptr %2) personality ptr @__C_specific_handler {
+ %4 = alloca ptr, align 8
+ %5 = alloca ptr, align 8
+ %6 = alloca ptr, align 8
+ invoke void %0(ptr %1, ptr %2, ptr %4, ptr %5, ptr %6)
+ to label %7 unwind label %8
+
+7:
+ ret void
+
+8:
+ %9 = cleanuppad within none []
+ cleanupret from %9 unwind to caller
+}
+; ASM-LABEL: invoke_many_args:
+; ASM: .Limpcall8:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: nop
+
+; ASM-LABEL .section .retplne,"yi"
+; ASM-NEXT .asciz "RetpolineV1"
+; ASM-NEXT .long 24
+; ASM-NEXT .secnum .text
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall7
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall8
+; ASM-NEXT .long 40
+; ASM-NEXT .secnum nc_sect
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall0
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall1
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall2
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall3
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum tc_sect
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall4
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall5
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall6
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 9 (IMAGE_RETPOLINE_AMD64_CFG_CALL)
+; OBJ-LABEL: <normal_call>:
+; OBJ: : ff 15 00 00 00 00 callq *(%rip)
+
+; Kind = 10 (IMAGE_RETPOLINE_AMD64_CFG_BR_REX)
+; OBJ-LABEL: <tc_sect>:
+; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip)
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
index fe22b251685e6..fb628fc34bdb5 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
@@ -1,8 +1,15 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
-
-; CHECK-LABEL: uses_rax:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: jmpq *%rax
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+; ASM-LABEL: uses_rax:
+; ASM: .Limpcall0:
+; ASM-NEXT: jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define void @uses_rax(i32 %x) {
entry:
@@ -34,9 +41,13 @@ sw.epilog:
ret void
}
-; CHECK-LABEL: uses_rcx:
-; CHECK: .Limpcall1:
-; CHECK-NEXT: jmpq *%rcx
+; ASM-LABEL: uses_rcx:
+; ASM: .Limpcall1:
+; ASM-NEXT: jmpq *%rcx
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define void @uses_rcx(i32 %x) {
entry:
@@ -70,14 +81,32 @@ sw.epilog:
declare void @g(i32)
-; CHECK-LABEL: .section .retplne,"yi"
-; CHECK-NEXT: .asciz "RetpolineV1"
-; CHECK-NEXT: .long 24
-; CHECK-NEXT: .secnum .text
-; CHECK-NEXT: .long 16
-; CHECK-NEXT: .secoffset .Limpcall0
-; CHECK-NEXT: .long 17
-; CHECK-NEXT: .secoffset .Limpcall1
+; ASM-LABEL: .section .retplne,"yi"
+; ASM-NEXT: .asciz "RetpolineV1"
+; ASM-NEXT: .long 24
+; ASM-NEXT: .secnum .text
+; ASM-NEXT: .long 16
+; ASM-NEXT: .secoffset .Limpcall0
+; ASM-NEXT: .long 17
+; ASM-NEXT: .secoffset .Limpcall1
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 16-31 (IMAGE_RETPOLINE_AMD64_SWITCHTABLE_*)
+; OBJ-LABEL: <uses_rax>:
+; OBJ: : ff e0 jmpq *%rax
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-LABEL: <uses_rcx>:
+; OBJ: : ff e1 jmpq *%rcx
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
index cc7e1a9f81e34..0d62779cb444b 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -1,67 +1,189 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \
+; RUN: FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void @a()
call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
ret void
}
-; CHECK-LABEL: normal_call:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
-; CHECK-NEXT: nopl 8(%rax,%rax)
-; CHECK-NEXT: .Limpcall1:
-; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
-; CHECK-NEXT: nopl 8(%rax,%rax)
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: .Limpcall2:
-; CHECK-NEXT: callq *%rax
-; CHECK-NEXT: nopl (%rax)
-; CHECK-NEXT: nop
+; ASM-LABEL: normal_call:
+; ASM: .Limpcall0:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: .Limpcall1:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: movq %rsi, %rax
+; ASM-NEXT: .Limpcall2:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+; ASM-NEXT: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall3:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+; ASM-NEXT: nop
define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
entry:
tail call void @b()
ret void
}
-; CHECK-LABEL: tail_call:
-; CHECK: .Limpcall3:
-; CHECK-NEXT: jmp __imp_b
+; ASM-LABEL: tail_call:
+; ASM: .Limpcall4:
+; ASM-NEXT: rex64 jmpq *__imp_b(%rip)
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
-; CHECK-LABEL: tail_call_fp:
-; CHECK: movq %rcx, %rax
-; CHECK-NEXT: .Limpcall4:
-; CHECK-NEXT: rex64 jmpq *%rax
+; ASM-LABEL: tail_call_fp:
+; ASM: movq %rcx, %rax
+; ASM-NEXT: .Limpcall5:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; ASM-LABEL: tail_call_global_fp:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall6:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+; Regression test: conditional tail calls can't be encoded, so make sure they aren't emitted.
+define void @might_call(i1 %4) local_unnamed_addr {
+ br i1 %4, label %makecall, label %finish
+
+makecall:
+ tail call void @a()
+ br label %finish
+
+finish:
+ ret void
+}
+; ASM-LABEL: might_call:
+; ASM: .Limpcall7:
+; ASM-NEXT: rex64 jmpq *__imp_a(%rip)
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+; Regression test: this particular sequence caused a cycle in DAG scheduling due
+; to the requirement to use RAX for register-indirect calls. We now explicitly
+; copy to RAX, which breaks the cycle.
+define dso_local i32 @not_scheduled_repro(ptr %0, ptr %1, ptr %2) local_unnamed_addr {
+ %4 = load i64, ptr %0, align 8
+ %5 = inttoptr i64 %4 to ptr
+ %6 = tail call i64 %5(ptr noundef %1)
+ store i64 %6, ptr %2, align 8
+ ret i32 0
+}
+; ASM-LABEL: not_scheduled_repro:
+; ASM: movq (%rcx), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall8:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+
+define dso_local void @not_scheduled_repro_tc(ptr %0, ptr %1) local_unnamed_addr {
+ %4 = load i64, ptr %0, align 8
+ %5 = inttoptr i64 %4 to ptr
+ tail call void %5(ptr noundef %1)
+ ret void
+}
+; ASM-LABEL: not_scheduled_repro_tc:
+; ASM: movq (%rcx), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall9:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
declare dllimport void @a() local_unnamed_addr
declare dllimport void @b() local_unnamed_addr
-; CHECK-LABEL .section .retplne,"yi"
-; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 24
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall3
-; CHECK-NEXT .long 5
-; CHECK-NEXT .secoffset .Limpcall4
-; CHECK-NEXT .long 32
-; CHECK-NEXT .secnum nc_sect
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall0
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall1
-; CHECK-NEXT .long 5
-; CHECK-NEXT .secoffset .Limpcall2
+; ASM-LABEL .section .retplne,"yi"
+; ASM-NEXT .asciz "RetpolineV1"
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum tc_sect
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall4
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall5
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall6
+; ASM-NEXT .long 40
+; ASM-NEXT .secnum nc_sect
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall0
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall1
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall2
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall3
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum .text
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall7
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall8
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall9
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 3 (IMAGE_RETPOLINE_AMD64_IMPORT_CALL)
+; OBJ-LABEL: <normal_call>:
+; OBJ: : 48 ff 15 00 00 00 00 callq *(%rip)
+; OBJ-NEXT: : 0f 1f 44 00 00 nopl (%rax,%rax)
+
+; Kind = 5 (IMAGE_RETPOLINE_AMD64_INDIR_CALL)
+; OBJ: : ff d0 callq *%rax
+; OBJ-NEXT: : 0f 1f 00 nopl (%rax)
+
+; Kind = 2 (IMAGE_RETPOLINE_AMD64_IMPORT_BR)
+; OBJ-LABEL: <tc_sect>:
+; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip)
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+
+; Kind = 6 (IMAGE_RETPOLINE_AMD64_INDIR_BR)
+; OBJ-LABEL: <tail_call_fp>:
+; OBJ: : 48 ff e0 jmpq *%rax
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}