[PATCH] Make sibling call opt work with common x86 callee pop conventions

Reid Kleckner rnk at google.com
Thu Mar 27 19:54:48 PDT 2014


Hi nicholas,

The goal here is to ensure that virtual member pointer and vtable thunks on
Windows get tail call optimized.  This is an ABI requirement for calls
that use inalloca.  If TCO fails, even at -O0, LLVM will miscompile the
program.

Fortunately, all the thunks generated by Clang should be trivially
eligible for tail call optimization: the type of the caller and callee
match almost perfectly, modulo the type of the 'this' pointer.

Ignoring inalloca, this patch should stand on its own as correct.

This patch doesn't implement any checking or verification to defend
against miscompilation of inalloca if TCO fails, and I'm open to
suggestions on how to implement that.  One idea is that the backend
should assert that calls marked with 'tail' and 'inalloca' are always
tail call optimized.  However, that would mean that optimizers can't
speculatively add the 'tail' marker to calls with 'inalloca'.  Do we
have such optimizations?  Do we mind changing the rules for tail and
inalloca here?

Alternatively I could add a new call instruction marker like
'guaranteed_tail' that aborts if TCO fails.

http://llvm-reviews.chandlerc.com/D3209

Files:
  lib/Target/X86/X86ISelLowering.cpp
  test/CodeGen/X86/sibcall-thiscall-thunks.ll

Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -2106,7 +2106,8 @@
 /// supports tail call optimization.
 static bool IsTailCallConvention(CallingConv::ID CC) {
   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
-          CC == CallingConv::HiPE);
+          CC == CallingConv::HiPE || CC == CallingConv::X86_StdCall ||
+          CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_FastCall);
 }
 
 /// \brief Return true if the calling convention is a C calling convention.
@@ -3126,9 +3127,11 @@
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
-  // Also avoid sibcall optimization if either caller or callee uses struct
-  // return semantics.
-  if (isCalleeStructRet || isCallerStructRet)
+  // Also avoid sibcall optimization if the caller and callee have mismatched
+  // sret conventions.
+  if (!CCMatch && (isCalleeStructRet || isCallerStructRet))
+    return false;
+  if (isCalleeStructRet != isCallerStructRet)
     return false;
 
   // An stdcall/thiscall caller is expected to clean up its arguments; the
@@ -3227,7 +3230,13 @@
     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
     if (CCInfo.getNextStackOffset()) {
       MachineFunction &MF = DAG.getMachineFunction();
-      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+
+      // If this is a callee pop convention, the amount of argument
+      // stack space available in the caller must match that of the callee.
+      if (X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+                           getTargetMachine().Options.GuaranteedTailCallOpt) &&
+          CCInfo.getNextStackOffset() !=
+              MF.getInfo<X86MachineFunctionInfo>()->getArgumentStackSize())
         return false;
 
       // Check if the arguments are already laid out in the right way as
Index: test/CodeGen/X86/sibcall-thiscall-thunks.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/sibcall-thiscall-thunks.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+
+; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
+
+; struct A { A(); ~A(); int a; };
+;
+; struct B {
+;   virtual int  f(int);
+;   virtual int  g(A, int, A);
+;   virtual void h(A, int, A);
+;   virtual A    i(A, int, A);
+;   virtual A    j(int);
+; };
+;
+; int  (B::*mp_f)(int)       = &B::f;
+; int  (B::*mp_g)(A, int, A) = &B::g;
+; void (B::*mp_h)(A, int, A) = &B::h;
+; A    (B::*mp_i)(A, int, A) = &B::i;
+; A    (B::*mp_j)(int)       = &B::j;
+
+; Each member pointer creates a thunk.  The ones with inalloca are required to
+; be tail calls by the ABI, even at O0.
+
+%struct.B = type { i32 (...)** }
+%struct.A = type { i32 }
+
+; CHECK-LABEL: f_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
+  %vtable = load i32 (%struct.B*, i32)*** %1
+  %2 = load i32 (%struct.B*, i32)** %vtable
+  %3 = tail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
+  ret i32 %3
+}
+
+; CHECK-LABEL: g_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = tail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+  ret i32 %3
+}
+
+; CHECK-LABEL: h_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+  %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  tail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+  ret void
+}
+
+; CHECK-LABEL: i_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+  %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+  %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = tail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
+  ret %struct.A* %3
+}
+
+; CHECK-LABEL: j_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
+  %vtable = load void (%struct.A*, %struct.B*, i32)*** %1
+  %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
+  %2 = load void (%struct.A*, %struct.B*, i32)** %vfn
+  tail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
+  ret void
+}
+
+; CHECK-LABEL: k_thunk:
+; CHECK: call
+; CHECK: retl
+define x86_thiscallcc void @k_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*)***
+  %vtable = load void (%struct.A*, %struct.B*)*** %1
+  %vfn = getelementptr inbounds void (%struct.A*, %struct.B*)** %vtable, i32 5
+  %2 = load void (%struct.A*, %struct.B*)** %vfn
+  tail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this)
+  ret void
+}
+
+; CHECK-LABEL: l_thunk:
+; CHECK: call
+; CHECK: retl
+define x86_thiscallcc void @l_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32, i32)***
+  %vtable = load void (%struct.A*, %struct.B*, i32, i32)*** %1
+  %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32, i32)** %vtable, i32 5
+  %2 = load void (%struct.A*, %struct.B*, i32, i32)** %vfn
+  tail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 1, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: _stdcall_thunk@8:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc void @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
+entry:
+  %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+  %this = load %struct.B** %this_ptr
+  %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+  %vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+  %3 = tail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
+  ret void
+}
+
+; CHECK-LABEL: @fastcall_thunk@8:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc void @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+  %3 = tail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
+  ret void
+}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D3209.1.patch
Type: text/x-patch
Size: 8140 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140327/def7e4eb/attachment.bin>


More information about the llvm-commits mailing list