[llvm] [win][x64] Fix import call optimization for calls to dllimports and global function pointers (PR #160604)
Daniel Paoliello via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 13:57:33 PDT 2025
https://github.com/dpaoliello created https://github.com/llvm/llvm-project/pull/160604
As noted in #149714, Import Call Optimization on x64 was generating relative calls instead of indirect calls for `dllimport` functions.
Additionally, calls through function pointers stored in `GlobalValue`s were not being marked in the metadata at all.
As part of making this work, FastISel no longer bails out when Import Call Optimization is enabled; it is now supported directly.
Note that there is still work to do here: when FastISel is enabled, LLVM generates calls to the CFG dispatch function as register-indirect calls. This technically works, but it is sub-optimal and the wrong sequence for Import Call Optimization.
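For reference, here is a sketch of the sequences Import Call Optimization expects, distilled from the updated tests in this PR (label names are illustrative). A call to a `dllimport` function must be an indirect call through its IAT entry, with a REX prefix and a 5-byte nop after the call:

.Limpcall0:
        rex64
        callq   *__imp_a(%rip)
        nopl    8(%rax,%rax)

A call through a function pointer loaded from a global must be recorded in the metadata and go through RAX:

        movq    global_func_ptr(%rip), %rax
.Limpcall1:
        callq   *%rax
        nopl    (%rax)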
Fixes #149714
From 6075d26eef2a901426b11d231f60f9ef5ba012ec Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Wed, 17 Sep 2025 17:29:41 -0700
Subject: [PATCH] [win][x64] Fix import call optimization for calls to
dllimports and global function pointers
---
llvm/lib/Target/X86/X86FastISel.cpp | 16 ++--
llvm/lib/Target/X86/X86InstrCompiler.td | 8 +-
llvm/lib/Target/X86/X86InstrControl.td | 4 +-
llvm/lib/Target/X86/X86InstrPredicates.td | 3 +-
llvm/lib/Target/X86/X86MCInstLower.cpp | 39 ++++++++--
.../win-import-call-optimization-cfguard.ll | 75 ++++++++++++++++---
.../win-import-call-optimization-jumptable.ll | 1 +
.../X86/win-import-call-optimization.ll | 52 +++++++++----
8 files changed, 155 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index f007886115d35..5f8830469e893 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3317,11 +3317,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
- // Can't handle import call optimization.
- if (Is64Bit &&
- MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
- return false;
-
SmallVector<MVT, 16> OutVTs;
SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
@@ -3563,6 +3558,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+
+ const Module *M = FuncInfo.MF->getFunction().getParent();
+ if (CalleeOp != X86::RAX && Is64Bit &&
+ M->getModuleFlag("import-call-optimization")) {
+ // Import call optimization requires all indirect calls to be via RAX.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::COPY), X86::RAX)
+ .addReg(CalleeOp);
+ CalleeOp = X86::RAX;
+ }
+
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 0fd44b74fd449..51dc67b36edfa 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1350,11 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
(TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
- Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
@@ -1364,11 +1364,11 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
(TCRETURNmi64 addr:$dst, timm:$off)>,
- Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
(TCRETURN_WINmi64 addr:$dst, timm:$off)>,
- Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>;
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index e8527cd73abb5..5ca051072fe07 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -331,11 +331,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
- NotUseIndirectThunkCalls]>;
+ NotUseIndirectThunkCalls,ImportCallOptimizationDisabledOrCFGuardEnabled]>;
// Non-tracking calls for IBT, use with caution.
let isCodeGenOnly = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index c20bb05018b4d..025c0ebd7bb9b 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -235,7 +235,8 @@ let RecomputePerFunction = 1 in {
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
- def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
+ def ImportCallOptimizationDisabledOrCFGuardEnabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\") ||"
+ "MF->getFunction().getParent()->getModuleFlag(\"cfguard\")">;
def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 481a9be8374ab..6ea114280b4f1 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2346,7 +2346,8 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr64_REX: {
if (EnableImportCallOptimization) {
- assert(MI->getOperand(0).getReg() == X86::RAX &&
+ assert((MI->getOperand(0).getReg() == X86::RAX ||
+ MF->getFunction().getParent()->getModuleFlag("cfguard")) &&
"Indirect tail calls with impcall enabled must go through RAX (as "
"enforced by TCRETURNImpCallri64)");
emitLabelAndRecordForImportCallOptimization(
@@ -2551,13 +2552,19 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
- MCInst TmpInst;
- MCInstLowering.Lower(MI, TmpInst);
-
// For Import Call Optimization to work, we need the call instruction
// with a rex prefix, and a 5-byte nop after the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
- emitCallInstruction(TmpInst);
+ // Following the pattern of how dllimport calls are lowered in FastISel.
+ emitCallInstruction(
+ MCInstBuilder(X86::CALL64m)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(MCSymbolRefExpr::create(
+ MCInstLowering.GetSymbolFromOperand(MI->getOperand(0)),
+ OutStreamer->getContext()))
+ .addReg(0));
emitNop(*OutStreamer, 5, Subtarget);
maybeEmitNopAfterCallForWindowsEH(MI);
return;
@@ -2586,9 +2593,25 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case X86::CALL64m:
- if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
- emitLabelAndRecordForImportCallOptimization(
- IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ if (EnableImportCallOptimization) {
+ if (isCallToCFGuardFunction(MI)) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_CFG_CALL);
+ } else if (isImportedFunction(MI->getOperand(3))) {
+ emitLabelAndRecordForImportCallOptimization(
+ IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+      // For Import Call Optimization to work, we need the call instruction
+      // with a rex prefix, and a 5-byte nop after the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ emitCallInstruction(TmpInst);
+ emitNop(*OutStreamer, 5, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
+ return;
+ }
}
break;
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
index 12be910d68ee9..6cda56ea98b0e 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
@@ -1,13 +1,48 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s
+
+; FIXME: FastISel is emitting calls to the CFG dispatch function as indirect
+; calls via registers. This technically works, but it is the wrong pattern for
+; Import Call Optimization.
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
+declare dllimport void @a() local_unnamed_addr
+declare dllimport void @b() local_unnamed_addr
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
+ call void @a()
+ call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
ret void
}
; CHECK-LABEL: normal_call:
-; CHECK: .Limpcall0:
+; CHECK: movq %rcx, %rsi
+; CHECK-NEXT: .Limpcall0:
+; CHECK-NEXT: rex64
+; CHECK-NEXT: callq *__imp_a(%rip)
+; CHECK-NEXT: nopl 8(%rax,%rax)
+; CHECK-NEXT: .Limpcall1:
+; CHECK-NEXT: rex64
+; CHECK-NEXT: callq *__imp_a(%rip)
+; CHECK-NEXT: nopl 8(%rax,%rax)
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: .Limpcall2:
+; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; CHECK-NEXT: movq global_func_ptr(%rip), %rax
+; CHECK-NEXT: .Limpcall3:
; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
+; CHECK-NEXT: nop
+
+define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
+entry:
+ tail call void @b()
+ ret void
+}
+; CHECK-LABEL: tail_call:
+; CHECK: .Limpcall4:
+; CHECK-NEXT: jmp __imp_b
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
@@ -15,19 +50,41 @@ entry:
ret void
}
; CHECK-LABEL: tail_call_fp:
-; CHECK: .Limpcall1:
+; CHECK: movq %rcx, %rax
+; CHECK-NEXT: .Limpcall5:
; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; CHECK-LABEL: tail_call_global_fp:
+; CHECK: movq global_func_ptr(%rip), %rax
+; CHECK-NEXT: .Limpcall6:
+; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
+
; CHECK-LABEL .section .retplne,"yi"
; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 16
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 10
-; CHECK-NEXT .secoffset .Limpcall1
-; CHECK-NEXT .long 16
+; CHECK-NEXT .long 40
; CHECK-NEXT .secnum nc_sect
-; CHECK-NEXT .long 9
+; CHECK-NEXT .long 3
; CHECK-NEXT .secoffset .Limpcall0
+; CHECK-NEXT .long 3
+; CHECK-NEXT .secoffset .Limpcall1
+; CHECK-NEXT .long 9
+; CHECK-NEXT .secoffset .Limpcall2
+; CHECK-NEXT .long 9
+; CHECK-NEXT .secoffset .Limpcall3
+; CHECK-NEXT .long 32
+; CHECK-NEXT .secnum tc_sect
+; CHECK-NEXT .long 2
+; CHECK-NEXT .secoffset .Limpcall4
+; CHECK-NEXT .long 4
+; CHECK-NEXT .secoffset .Limpcall5
+; CHECK-NEXT .long 4
+; CHECK-NEXT .secoffset .Limpcall6
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"import-call-optimization", i32 1}
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
index fe22b251685e6..c2389a10415d1 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s
; CHECK-LABEL: uses_rax:
; CHECK: .Limpcall0:
diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
index cc7e1a9f81e34..eba0dab7e1559 100644
--- a/llvm/test/CodeGen/X86/win-import-call-optimization.ll
+++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -1,27 +1,36 @@
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s
+; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s
+; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \
+; RUN: FileCheck %s
+
+ at global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void @a()
call void @a()
call void %func_ptr()
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ call void %0()
ret void
}
; CHECK-LABEL: normal_call:
; CHECK: .Limpcall0:
; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
+; CHECK-NEXT: callq *__imp_a(%rip)
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: .Limpcall1:
; CHECK-NEXT: rex64
-; CHECK-NEXT: callq __imp_a
+; CHECK-NEXT: callq *__imp_a(%rip)
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .Limpcall2:
; CHECK-NEXT: callq *%rax
; CHECK-NEXT: nopl (%rax)
+; CHECK-NEXT: movq global_func_ptr(%rip), %rax
+; CHECK-NEXT: .Limpcall3:
+; CHECK-NEXT: callq *%rax
+; CHECK-NEXT: nopl (%rax)
; CHECK-NEXT: nop
define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
@@ -30,7 +39,7 @@ entry:
ret void
}
; CHECK-LABEL: tail_call:
-; CHECK: .Limpcall3:
+; CHECK: .Limpcall4:
; CHECK-NEXT: jmp __imp_b
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
@@ -40,7 +49,18 @@ entry:
}
; CHECK-LABEL: tail_call_fp:
; CHECK: movq %rcx, %rax
-; CHECK-NEXT: .Limpcall4:
+; CHECK-NEXT: .Limpcall5:
+; CHECK-NEXT: rex64 jmpq *%rax
+
+define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
+entry:
+ %0 = load ptr, ptr @global_func_ptr, align 8
+ tail call void %0()
+ ret void
+}
+; CHECK-LABEL: tail_call_global_fp:
+; CHECK: movq global_func_ptr(%rip), %rax
+; CHECK-NEXT: .Limpcall6:
; CHECK-NEXT: rex64 jmpq *%rax
declare dllimport void @a() local_unnamed_addr
@@ -48,13 +68,7 @@ declare dllimport void @b() local_unnamed_addr
; CHECK-LABEL .section .retplne,"yi"
; CHECK-NEXT .asciz "RetpolineV1"
-; CHECK-NEXT .long 24
-; CHECK-NEXT .secnum tc_sect
-; CHECK-NEXT .long 3
-; CHECK-NEXT .secoffset .Limpcall3
-; CHECK-NEXT .long 5
-; CHECK-NEXT .secoffset .Limpcall4
-; CHECK-NEXT .long 32
+; CHECK-NEXT .long 40
; CHECK-NEXT .secnum nc_sect
; CHECK-NEXT .long 3
; CHECK-NEXT .secoffset .Limpcall0
@@ -62,6 +76,16 @@ declare dllimport void @b() local_unnamed_addr
; CHECK-NEXT .secoffset .Limpcall1
; CHECK-NEXT .long 5
; CHECK-NEXT .secoffset .Limpcall2
+; CHECK-NEXT .long 5
+; CHECK-NEXT .secoffset .Limpcall3
+; CHECK-NEXT .long 32
+; CHECK-NEXT .secnum tc_sect
+; CHECK-NEXT .long 2
+; CHECK-NEXT .secoffset .Limpcall4
+; CHECK-NEXT .long 4
+; CHECK-NEXT .secoffset .Limpcall5
+; CHECK-NEXT .long 4
+; CHECK-NEXT .secoffset .Limpcall6
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}