[clang] [llvm] [win][x64] Multiple fixes for import call optimization (PR #160604)
Daniel Paoliello via cfe-commits
cfe-commits at lists.llvm.org
Fri Jan 16 14:58:16 PST 2026
https://github.com/dpaoliello updated https://github.com/llvm/llvm-project/pull/160604
>From 81fb78cfc7e00cb60c4979dbdae059bccc612bc7 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Fri, 12 Dec 2025 09:45:37 -0800
Subject: [PATCH] [win][x64] Fix import call optimization for calls to
dllimports and global function pointers
---
.../CodeGenCXX/microsoft-abi-eh-ip2state.cpp | 2 +-
llvm/lib/Target/X86/X86AsmPrinter.cpp | 4 +-
llvm/lib/Target/X86/X86AsmPrinter.h | 2 +-
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 8 +-
llvm/lib/Target/X86/X86FastISel.cpp | 16 +-
llvm/lib/Target/X86/X86FrameLowering.cpp | 4 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 -
llvm/lib/Target/X86/X86ISelLowering.h | 4 -
llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 16 +-
llvm/lib/Target/X86/X86InstrCompiler.td | 11 +-
llvm/lib/Target/X86/X86InstrControl.td | 10 +-
llvm/lib/Target/X86/X86InstrFragments.td | 4 -
llvm/lib/Target/X86/X86InstrInfo.cpp | 15 +-
llvm/lib/Target/X86/X86InstrPredicates.td | 2 -
llvm/lib/Target/X86/X86MCInstLower.cpp | 114 +++++++---
llvm/lib/Target/X86/X86RegisterInfo.cpp | 1 -
llvm/lib/Target/X86/X86RegisterInfo.td | 4 -
.../win-import-call-optimization-cfguard.ll | 154 +++++++++++--
.../win-import-call-optimization-jumptable.ll | 61 ++++--
.../X86/win-import-call-optimization.ll | 202 ++++++++++++++----
20 files changed, 463 insertions(+), 173 deletions(-)
diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
index 0b7b406e2ba8e..541789fc9d339 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
@@ -40,7 +40,7 @@ void case_calls_dll_import() NO_TAIL {
// CHECK: .seh_endprologue
// CHECK: .Limpcall{{[0-9]+}}:
// CHECK-NEXT: rex64
-// CHECK-NEXT: call __imp_some_dll_import
+// CHECK-NEXT: call qword ptr [rip + __imp_some_dll_import]
// CHECK-NEXT: nop dword ptr {{\[.*\]}}
// CHECK-NEXT: nop
// CHECK-NEXT: .seh_startepilogue
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 84b921222a116..6a876b8963545 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -479,8 +479,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURN_WINmi64 || Opc == X86::TCRETURNri64 ||
- Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall ||
- Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
+ Opc == X86::TCRETURNmi64 || Opc == X86::TAILJMPr64_REX ||
+ Opc == X86::TAILJMPm64_REX;
}
void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index e02b5562d3b5e..7c55f06e86d4b 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -53,7 +53,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
MCSymbol *CalleeSymbol;
ImportCallKind Kind;
};
- DenseMap<MCSection *, std::vector<ImportCallInfo>>
+ MapVector<MCSection *, std::vector<ImportCallInfo>>
SectionToImportedFunctionCalls;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 2f5ee9d2c9a13..6574bd2d974a8 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -290,7 +290,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
case X86::TCRETURNdi64:
case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
case X86::TCRETURN_WINmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64 ||
@@ -366,9 +365,7 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
- } else if (Opcode == X86::TCRETURNri64 ||
- Opcode == X86::TCRETURNri64_ImpCall ||
- Opcode == X86::TCRETURN_WIN64ri) {
+ } else if (Opcode == X86::TCRETURNri64 || Opcode == X86::TCRETURN_WIN64ri) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
@@ -722,9 +719,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB,
case X86::CALL64m_RVMARKER:
expandCALL_RVMARKER(MBB, MBBI);
return true;
- case X86::CALL64r_ImpCall:
- MI.setDesc(TII->get(X86::CALL64r));
- return true;
case X86::ADD32mi_ND:
case X86::ADD64mi32_ND:
case X86::SUB32mi_ND:
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index c69ca77031495..26b247c797b00 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3326,11 +3326,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
- // Can't handle import call optimization.
- if (Is64Bit &&
- MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
- return false;
-
SmallVector<MVT, 16> OutVTs;
SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
@@ -3572,6 +3567,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+
+ const Module *M = FuncInfo.MF->getFunction().getParent();
+ if (CalleeOp != X86::RAX && Is64Bit &&
+ M->getModuleFlag("import-call-optimization")) {
+ // Import call optimization requires all indirect calls to be via RAX.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::COPY), X86::RAX)
+ .addReg(CalleeOp);
+ CalleeOp = X86::RAX;
+ }
+
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 8bca6344d6521..7494f756de68a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2400,8 +2400,8 @@ static bool isTailCallOpcode(unsigned Opc) {
return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
- Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
- Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURN_WINmi64;
+ Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64 ||
+ Opc == X86::TCRETURN_WINmi64;
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ef94c198558c7..3d8c455af01ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35396,7 +35396,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FST)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(CALL_RVMARKER)
- NODE_NAME_CASE(IMP_CALL)
NODE_NAME_CASE(BT)
NODE_NAME_CASE(CMP)
NODE_NAME_CASE(FCMP)
@@ -63310,7 +63309,6 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
Register TargetReg;
switch (MBBI->getOpcode()) {
case X86::CALL64r:
- case X86::CALL64r_ImpCall:
case X86::CALL64r_NT:
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 08d5e2331727b..a24c8dde6497b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -94,10 +94,6 @@ namespace llvm {
/// POP_FROM_X87_REG (which may remove a required FPU stack pop).
POP_FROM_X87_REG,
- // Pseudo for a call to an imported function to ensure the correct machine
- // instruction is emitted for Import Call Optimization.
- IMP_CALL,
-
/// X86 compare and logical compare instructions.
CMP,
FCMP,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index f4de2f8c6c22e..f2fb5c685f348 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2585,6 +2585,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"CFG Function load should not have an offset");
Callee = DAG.getTargetGlobalAddress(
GA->getGlobal(), dl, GA->getValueType(0), 0, X86II::MO_NO_FLAG);
+ } else if (M->getModuleFlag("import-call-optimization")) {
+ // When import call optimization is enabled, all register indirect calls
+ // must use RAX.
+ Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Callee, InGlue);
+ InGlue = Chain.getValue(1);
+ Callee = DAG.getRegister(X86::RAX, Callee.getValueType());
}
SmallVector<SDValue, 8> Ops;
@@ -2689,8 +2695,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
- auto Opcode =
- IsCFGuardCall ? X86ISD::TC_RETURN_GLOBALADDR : X86ISD::TC_RETURN;
+ auto Opcode = (IsCFGuardCall || IsImpCall) ? X86ISD::TC_RETURN_GLOBALADDR
+ : X86ISD::TC_RETURN;
SDValue Ret = DAG.getNode(Opcode, dl, MVT::Other, Ops);
if (IsCFICall)
@@ -2703,11 +2709,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain & a glue for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- if (IsImpCall) {
- Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
- } else if (IsNoTrackIndirectCall) {
+ if (IsNoTrackIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
- } else if (IsCFGuardCall) {
+ } else if (IsCFGuardCall || IsImpCall) {
Chain = DAG.getNode(X86ISD::CALL_GLOBALADDR, dl, NodeTys, Ops);
} else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
// Calls with a "clang.arc.attachedcall" bundle are special. They should be
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 3e07db678809d..3ca60135784de 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1322,9 +1322,6 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
(CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
-def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>;
-
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
@@ -1359,15 +1356,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
(TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
- Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
-
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>;
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index c67feb7668234..aed7df993880a 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -331,7 +331,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
@@ -364,10 +364,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
- def TCRETURNri64_ImpCall : PseudoI<(outs),
- (ins GR64_A:$dst, i32imm:$offset),
- []>, Sched<[WriteJump]>;
-
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
(ins i64mem_TC:$dst, i32imm:$offset),
@@ -433,10 +429,6 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
def CALL64pcrel32_RVMARKER :
PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
Requires<[In64BitMode]>;
-
- def CALL64r_ImpCall :
- PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>,
- Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>;
}
// Conditional tail calls are similar to the above, but they are branches
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 38ab02667317e..29bf4c46ae69c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -214,10 +214,6 @@ def X86call_globaladdr : SDNode<"X86ISD::CALL_GLOBALADDR", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
-def X86imp_call : SDNode<"X86ISD::IMP_CALL", SDT_X86Call,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
- SDNPVariadic]>;
-
def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 53b148c11c4e1..ebe60922abd9f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3700,7 +3700,6 @@ bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
return true;
default:
@@ -3731,9 +3730,16 @@ bool X86InstrInfo::canMakeTailCallConditional(
return false;
}
- if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
+ if (Subtarget.isTargetWin64()) {
// Conditional tail calls confuse the Win64 unwinder.
- return false;
+ if (MF->hasWinCFI())
+ return false;
+
+ // Conditional tail calls cannot be encoded in the Import Call Optimization
+ // metadata.
+ if (MF->getFunction().getParent()->getModuleFlag(
+ "import-call-optimization"))
+ return false;
}
assert(BranchCond.size() == 1);
@@ -7496,8 +7502,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
- (Opc == X86::CALL32r || Opc == X86::CALL64r ||
- Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
+ (Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r ||
Opc == X86::PUSH32r || Opc == X86::PUSH64r))
return nullptr;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..1d23604d66d2c 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -237,8 +237,6 @@ let RecomputePerFunction = 1 in {
"shouldOptForSize(MF)">;
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
- def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
- def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 2287a921a19c0..0a70f1ad7b8f8 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2311,10 +2311,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
- if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_IMPORT_BR);
- }
+ if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0)))
+ reportFatalInternalError(
+ "Tail jumps to imported functions must use TAILJMPm64_REX");
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
@@ -2329,8 +2328,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPm64:
case X86::TAILJMPd64_CC:
if (EnableImportCallOptimization)
- report_fatal_error("Unexpected TAILJMP instruction was emitted when "
- "import call optimization was enabled");
+ reportFatalInternalError(
+ "Unexpected TAILJMP instruction was emitted when "
+ "import call optimization was enabled");
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
@@ -2338,9 +2338,22 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case X86::TAILJMPm64_REX:
- if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+ if (EnableImportCallOptimization) {
+ if (isCallToCFGuardFunction(MI)) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
+ } else if (isImportedFunction(MI->getOperand(3))) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_IMPORT_BR);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by five int3 instructions.
+ for (int i = 0; i < 5; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
+ }
}
OutStreamer->AddComment("TAILCALL");
@@ -2349,11 +2362,20 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr64_REX: {
if (EnableImportCallOptimization) {
- assert(MI->getOperand(0).getReg() == X86::RAX &&
- "Indirect tail calls with impcall enabled must go through RAX (as "
- "enforced by TCRETURNImpCallri64)");
+ if (MI->getOperand(0).getReg() != X86::RAX)
+ reportFatalInternalError(
+ "Indirect tail calls with import call optimization enabled must "
+ "go through RAX");
emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_INDIR_BR);
+ IMAGE_RETPOLINE_AMD64_INDIR_BR_REX);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by 2 int3 instructions.
+ for (int i = 0; i < 2; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
}
OutStreamer->AddComment("TAILCALL");
@@ -2369,6 +2391,14 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
emitLabelAndRecordForImportCallOptimization(
(ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST +
EncodedReg));
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ emitCallInstruction(TmpInst);
+
+ // Must be followed by 4 int3 instructions.
+ for (int i = 0; i < 4; ++i)
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ return;
}
break;
@@ -2378,7 +2408,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::JMP32m:
case X86::JMP64m:
if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI))
- report_fatal_error(
+ reportFatalInternalError(
"Unexpected JMP instruction was emitted for a jump-table when import "
"call optimization was enabled");
break;
@@ -2550,29 +2580,19 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
- if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
-
- MCInst TmpInst;
- MCInstLowering.Lower(MI, TmpInst);
-
- // For Import Call Optimization to work, we need a the call instruction
- // with a rex prefix, and a 5-byte nop after the call instruction.
- EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
- emitCallInstruction(TmpInst);
- emitNop(*OutStreamer, 5, Subtarget);
- maybeEmitNopAfterCallForWindowsEH(MI);
- return;
- }
+ if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0)))
+ reportFatalInternalError(
+ "Calls to imported functions with import call optimization "
+ "should be lowered to CALL64m via CALL64_ImpCall");
break;
case X86::CALL64r:
if (EnableImportCallOptimization) {
- assert(MI->getOperand(0).getReg() == X86::RAX &&
- "Indirect calls with impcall enabled must go through RAX (as "
- "enforced by CALL64r_ImpCall)");
+ if (MI->getOperand(0).getReg() != X86::RAX)
+ reportFatalInternalError(
+ "Indirect calls with import call optimization enabled must go "
+ "through RAX");
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_INDIR_CALL);
@@ -2589,9 +2609,33 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case X86::CALL64m:
- if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ if (EnableImportCallOptimization) {
+ if (isCallToCFGuardFunction(MI)) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ } else if (isImportedFunction(MI->getOperand(3))) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ // For Import Call Optimization to work, we need the call instruction
+ // with a rex prefix, and a 5-byte nop after the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ emitCallInstruction(TmpInst);
+ // The MSVC linker is *very* picky about the exact nop to use.
+ MCInst Nop = MCInstBuilder(X86::NOOPL)
+ .addReg(X86::RAX)
+ .addImm(1)
+ .addReg(X86::RAX)
+ .addImm(0)
+ .addReg(0);
+ Nop.setFlags(X86::IP_USE_DISP8);
+ EmitAndCountInstruction(Nop);
+ maybeEmitNopAfterCallForWindowsEH(MI);
+ return;
+ }
}
break;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 72f38133e21ff..5878a0f7a61d3 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -983,7 +983,6 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
- case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
case X86::TCRETURN_WINmi64:
case X86::EH_RETURN:
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 692e42ae5e752..a513371506038 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -728,10 +728,6 @@ def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>;
def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>;
def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
-// Class to support Windows Import Call Optimization: all indirect jumps must
-// happen through RAX.
-def GR64_A : RegisterClass<"X86", [i64], 64, (add RAX)>;
-
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
index 12be910d68ee9..39d5a2596b5b6 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
@@ -1,33 +1,151 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \
+; RUN: FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
+declare dso_local i32 @__C_specific_handler(...)
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
+ call void @a()
+ call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
+ ret void
+}
+; ASM-LABEL: normal_call:
+; ASM: movq %rcx, %rsi
+; ASM-NEXT: .Limpcall0:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: .Limpcall1:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: movq %rsi, %rax
+; ASM-NEXT: .Limpcall2:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall3:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: nop
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+ tail call void @b()
ret void
}
-; CHECK-LABEL: normal_call:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-LABEL: tail_call:
+; ASM: .Limpcall4:
+; ASM-NEXT: rex64 jmpq *__imp_b(%rip)
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
-; CHECK-LABEL: tail_call_fp:
-; CHECK: .Limpcall1:
-; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
-
-; CHECK-LABEL .section .retplne,"yi"
-; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 16
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 10
-; CHECK-NEXT .secoffset .Limpcall1
-; CHECK-NEXT .long 16
-; CHECK-NEXT .secnum nc_sect
-; CHECK-NEXT .long 9
-; CHECK-NEXT .secoffset .Limpcall0
+; ASM-LABEL: tail_call_fp:
+; ASM: movq %rcx, %rax
+; ASM-NEXT: .Limpcall5:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; ASM-LABEL: tail_call_global_fp:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall6:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+; Regression test: the call to the CFG Guard was being indirected via a register, which is not
+; permitted when retpoline is enabled.
+define dso_local i32 @might_call_global_func_ptr(ptr %0, ptr %1, i32 %2) {
+3:
+ %4 = icmp eq i32 %2, 0
+ br i1 %4, label %5, label %8
+
+5: ; preds = %11
+ %6 = load ptr, ptr @global_func_ptr, align 8
+ %7 = tail call i32 %6(ptr noundef %1)
+ br label %8
+
+8:
+ %9 = phi i32 [ %7, %5 ], [ -1, %3 ]
+ ret i32 %9
+}
+; ASM-LABEL: might_call_global_func_ptr:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall7:
+; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
+define dso_local void @invoke_many_args(ptr %0, ptr %1, ptr %2) personality ptr @__C_specific_handler {
+ %4 = alloca ptr, align 8
+ %5 = alloca ptr, align 8
+ %6 = alloca ptr, align 8
+ invoke void %0(ptr %1, ptr %2, ptr %4, ptr %5, ptr %6)
+ to label %7 unwind label %8
+
+7:
+ ret void
+
+8:
+ %9 = cleanuppad within none []
+ cleanupret from %9 unwind to caller
+}
+; ASM-LABEL: invoke_many_args:
+; ASM: .Limpcall8:
+; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; ASM-NEXT: nop
+
+; ASM-LABEL .section .retplne,"yi"
+; ASM-NEXT .asciz "RetpolineV1"
+; ASM-NEXT .long 24
+; ASM-NEXT .secnum .text
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall7
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall8
+; ASM-NEXT .long 40
+; ASM-NEXT .secnum nc_sect
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall0
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall1
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall2
+; ASM-NEXT .long 9
+; ASM-NEXT .secoffset .Limpcall3
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum tc_sect
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall4
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall5
+; ASM-NEXT .long 10
+; ASM-NEXT .secoffset .Limpcall6
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 9 (IMAGE_RETPOLINE_AMD64_CFG_CALL)
+; OBJ-LABEL: <normal_call>:
+; OBJ: : ff 15 00 00 00 00 callq *(%rip)
+
+; Kind = 10 (IMAGE_RETPOLINE_AMD64_CFG_BR_REX)
+; OBJ-LABEL: <tc_sect>:
+; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip)
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
index fe22b251685e6..fb628fc34bdb5 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
@@ -1,8 +1,15 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
-
-; CHECK-LABEL: uses_rax:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: jmpq *%rax
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+; ASM-LABEL: uses_rax:
+; ASM: .Limpcall0:
+; ASM-NEXT: jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define void @uses_rax(i32 %x) {
entry:
@@ -34,9 +41,13 @@ sw.epilog:
ret void
}
-; CHECK-LABEL: uses_rcx:
-; CHECK: .Limpcall1:
-; CHECK-NEXT: jmpq *%rcx
+; ASM-LABEL: uses_rcx:
+; ASM: .Limpcall1:
+; ASM-NEXT: jmpq *%rcx
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define void @uses_rcx(i32 %x) {
entry:
@@ -70,14 +81,32 @@ sw.epilog:
declare void @g(i32)
-; CHECK-LABEL: .section .retplne,"yi"
-; CHECK-NEXT: .asciz "RetpolineV1"
-; CHECK-NEXT: .long 24
-; CHECK-NEXT: .secnum .text
-; CHECK-NEXT: .long 16
-; CHECK-NEXT: .secoffset .Limpcall0
-; CHECK-NEXT: .long 17
-; CHECK-NEXT: .secoffset .Limpcall1
+; ASM-LABEL: .section .retplne,"yi"
+; ASM-NEXT: .asciz "RetpolineV1"
+; ASM-NEXT: .long 24
+; ASM-NEXT: .secnum .text
+; ASM-NEXT: .long 16
+; ASM-NEXT: .secoffset .Limpcall0
+; ASM-NEXT: .long 17
+; ASM-NEXT: .secoffset .Limpcall1
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 16-31 (IMAGE_RETPOLINE_AMD64_SWITCHTABLE_*)
+; OBJ-LABEL: <uses_rax>:
+; OBJ: : ff e0 jmpq *%rax
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-LABEL: <uses_rcx>:
+; OBJ: : ff e1 jmpq *%rcx
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
index cc7e1a9f81e34..0d62779cb444b 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -1,67 +1,189 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \
+; RUN: FileCheck %s --check-prefix ASM
+; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \
+; RUN: | FileCheck %s --check-prefix OBJ
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void @a()
call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
ret void
}
-; CHECK-LABEL: normal_call:
-; CHECK: .Limpcall0:
-; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
-; CHECK-NEXT: nopl 8(%rax,%rax)
-; CHECK-NEXT: .Limpcall1:
-; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
-; CHECK-NEXT: nopl 8(%rax,%rax)
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: .Limpcall2:
-; CHECK-NEXT: callq *%rax
-; CHECK-NEXT: nopl (%rax)
-; CHECK-NEXT: nop
+; ASM-LABEL: normal_call:
+; ASM: .Limpcall0:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: .Limpcall1:
+; ASM-NEXT: rex64
+; ASM-NEXT: callq *__imp_a(%rip)
+; ASM-NEXT: nopl (%rax,%rax)
+; ASM-NEXT: movq %rsi, %rax
+; ASM-NEXT: .Limpcall2:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+; ASM-NEXT: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall3:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+; ASM-NEXT: nop
define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
entry:
tail call void @b()
ret void
}
-; CHECK-LABEL: tail_call:
-; CHECK: .Limpcall3:
-; CHECK-NEXT: jmp __imp_b
+; ASM-LABEL: tail_call:
+; ASM: .Limpcall4:
+; ASM-NEXT: rex64 jmpq *__imp_b(%rip)
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
-; CHECK-LABEL: tail_call_fp:
-; CHECK: movq %rcx, %rax
-; CHECK-NEXT: .Limpcall4:
-; CHECK-NEXT: rex64 jmpq *%rax
+; ASM-LABEL: tail_call_fp:
+; ASM: movq %rcx, %rax
+; ASM-NEXT: .Limpcall5:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; ASM-LABEL: tail_call_global_fp:
+; ASM: movq global_func_ptr(%rip), %rax
+; ASM-NEXT: .Limpcall6:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+; Regression test: conditional tail calls can't be encoded, so make sure they aren't emitted.
+define void @might_call(i1 %4) local_unnamed_addr {
+ br i1 %4, label %makecall, label %finish
+
+makecall:
+ tail call void @a()
+ br label %finish
+
+finish:
+ ret void
+}
+; ASM-LABEL: might_call:
+; ASM: .Limpcall7:
+; ASM-NEXT: rex64 jmpq *__imp_a(%rip)
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+; ASM-NEXT: int3
+
+; Regression test: this particular sequence caused a cycle in DAG scheduling due
+; to the requirement to use RAX for register-indirect calls. We now explicitly
+; copy to RAX, which breaks the cycle.
+define dso_local i32 @not_scheduled_repro(ptr %0, ptr %1, ptr %2) local_unnamed_addr {
+ %4 = load i64, ptr %0, align 8
+ %5 = inttoptr i64 %4 to ptr
+ %6 = tail call i64 %5(ptr noundef %1)
+ store i64 %6, ptr %2, align 8
+ ret i32 0
+}
+; ASM-LABEL: not_scheduled_repro:
+; ASM: movq (%rcx), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall8:
+; ASM-NEXT: callq *%rax
+; ASM-NEXT: nopl (%rax)
+
+define dso_local void @not_scheduled_repro_tc(ptr %0, ptr %1) local_unnamed_addr {
+ %4 = load i64, ptr %0, align 8
+ %5 = inttoptr i64 %4 to ptr
+ tail call void %5(ptr noundef %1)
+ ret void
+}
+; ASM-LABEL: not_scheduled_repro_tc:
+; ASM: movq (%rcx), %rax
+; ASM-NEXT: movq %rdx, %rcx
+; ASM-NEXT: .Limpcall9:
+; ASM-NEXT: rex64 jmpq *%rax
+; ASM-NEXT: int3
+; ASM-NEXT: int3
declare dllimport void @a() local_unnamed_addr
declare dllimport void @b() local_unnamed_addr
-; CHECK-LABEL .section .retplne,"yi"
-; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 24
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall3
-; CHECK-NEXT .long 5
-; CHECK-NEXT .secoffset .Limpcall4
-; CHECK-NEXT .long 32
-; CHECK-NEXT .secnum nc_sect
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall0
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall1
-; CHECK-NEXT .long 5
-; CHECK-NEXT .secoffset .Limpcall2
+; ASM-LABEL .section .retplne,"yi"
+; ASM-NEXT .asciz "RetpolineV1"
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum tc_sect
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall4
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall5
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall6
+; ASM-NEXT .long 40
+; ASM-NEXT .secnum nc_sect
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall0
+; ASM-NEXT .long 3
+; ASM-NEXT .secoffset .Limpcall1
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall2
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall3
+; ASM-NEXT .long 32
+; ASM-NEXT .secnum .text
+; ASM-NEXT .long 2
+; ASM-NEXT .secoffset .Limpcall7
+; ASM-NEXT .long 5
+; ASM-NEXT .secoffset .Limpcall8
+; ASM-NEXT .long 6
+; ASM-NEXT .secoffset .Limpcall9
+
+; The loader assumes an exact sequence of instructions/bytes at each marked site since it may
+; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link
+; time.
+
+; Kind = 3 (IMAGE_RETPOLINE_AMD64_IMPORT_CALL)
+; OBJ-LABEL: <normal_call>:
+; OBJ: : 48 ff 15 00 00 00 00 callq *(%rip)
+; OBJ-NEXT: : 0f 1f 44 00 00 nopl (%rax,%rax)
+
+; Kind = 5 (IMAGE_RETPOLINE_AMD64_INDIR_CALL)
+; OBJ: : ff d0 callq *%rax
+; OBJ-NEXT: : 0f 1f 00 nopl (%rax)
+
+; Kind = 2 (IMAGE_RETPOLINE_AMD64_IMPORT_BR)
+; OBJ-LABEL: <tc_sect>:
+; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip)
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
+
+; Kind = 6 (IMAGE_RETPOLINE_AMD64_INDIR_BR)
+; OBJ-LABEL: <tail_call_fp>:
+; OBJ: : 48 ff e0 jmpq *%rax
+; OBJ-NEXT: : cc int3
+; OBJ-NEXT: : cc int3
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}