[llvm] r290240 - [X86] Vectorcall Calling Convention - Adding CodeGen Complete Support

Oren Ben Simhon via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 21 00:31:46 PST 2016


Author: orenb
Date: Wed Dec 21 02:31:45 2016
New Revision: 290240

URL: http://llvm.org/viewvc/llvm-project?rev=290240&view=rev
Log:
[X86] Vectorcall Calling Convention - Adding CodeGen Complete Support

The vectorcall calling convention specifies that arguments to functions are to be passed in registers, when possible.
Vectorcall uses more registers for arguments than fastcall or the default x64 calling convention do.
The vectorcall calling convention is only supported in native code on x86 and x64 processors that include Streaming SIMD Extensions 2 (SSE2) and above.

The current implementation does not handle Homogeneous Vector Aggregates (HVAs) correctly, and this patch fixes that.
This commit also includes additional lit tests to better cover HVA corner cases.
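
As a minimal illustration (not part of this patch, and assuming MSVC or clang targeting Windows with SSE2 available), an HVA is a composite of up to four identical vector types, and vectorcall passes it in XMM registers when enough of them are free:

  // Hypothetical example: a Homogeneous Vector Aggregate of four __m128 members.
  #include <immintrin.h>

  struct HVA4 { __m128 x, y, z, w; };

  __m128 __vectorcall second_element(HVA4 h) {
    return h.y; // expected to arrive in XMM1 on x64 when registers are available
  }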

Differential Revision: https://reviews.llvm.org/D27392


Modified:
    llvm/trunk/include/llvm/CodeGen/CallingConvLower.h
    llvm/trunk/include/llvm/Target/TargetCallingConv.h
    llvm/trunk/lib/CodeGen/CallingConvLower.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/trunk/lib/Target/X86/X86CallingConv.cpp
    llvm/trunk/lib/Target/X86/X86CallingConv.h
    llvm/trunk/lib/Target/X86/X86CallingConv.td
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vectorcall.ll

Modified: llvm/trunk/include/llvm/CodeGen/CallingConvLower.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/CallingConvLower.h?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/CallingConvLower.h (original)
+++ llvm/trunk/include/llvm/CodeGen/CallingConvLower.h Wed Dec 21 02:31:45 2016
@@ -296,6 +296,12 @@ public:
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeFormalArguments.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                        CCAssignFn Fn) {
+    AnalyzeFormalArguments(Ins, Fn);
+  }
+
   /// AnalyzeReturn - Analyze the returned values of a return,
   /// incorporating info about the result values into this state.
   void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -318,11 +324,22 @@ public:
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                            CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeCallOperands.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        CCAssignFn Fn) {
+    AnalyzeCallOperands(Outs, Fn);
+  }
+
   /// AnalyzeCallResult - Analyze the return values of a call,
   /// incorporating info about the passed values into this state.
   void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn Fn);
 
+  /// A shadow allocated register is a register that was allocated
+  /// but wasn't added to the location list (Locs).
+  /// \returns true if the register was allocated as shadow or false otherwise.
+  bool IsShadowAllocatedReg(unsigned Reg) const;
+
   /// AnalyzeCallResult - Same as above except it's specialized for calls which
   /// produce a single value.
   void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
@@ -521,6 +538,37 @@ public:
                                 const SmallVectorImpl<ISD::InputArg> &Ins,
                                 CCAssignFn CalleeFn, CCAssignFn CallerFn);
 
+  /// The function runs an additional analysis pass over function arguments.
+  /// It will mark each argument with the attribute flag SecArgPass.
+  /// After running, it will sort the locs list.
+  template <class T>
+  void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
+                                  CCAssignFn Fn) {
+    unsigned NumFirstPassLocs = Locs.size();
+
+    /// Creates similar argument list to \p Args in which each argument is
+    /// marked using SecArgPass flag.
+    SmallVector<T, 16> SecPassArg;
+    // SmallVector<ISD::InputArg, 16> SecPassArg;
+    for (auto Arg : Args) {
+      Arg.Flags.setSecArgPass();
+      SecPassArg.push_back(Arg);
+    }
+
+    // Run the second argument pass
+    AnalyzeArguments(SecPassArg, Fn);
+
+    // Sort the locations of the arguments according to their original position.
+    SmallVector<CCValAssign, 16> TmpArgLocs;
+    std::swap(TmpArgLocs, Locs);
+    auto B = TmpArgLocs.begin(), E = TmpArgLocs.end();
+    std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E,
+               std::back_inserter(Locs),
+               [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                 return A.getValNo() < B.getValNo();
+               });
+  }
+
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);

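For context, here is a minimal sketch (not part of the diff) of how a target drives the new two-pass analysis; it mirrors the X86ISelLowering.cpp hunk further down, and CallConv, isVarArg, MF, DAG, Ins and CC_X86 are assumed from that surrounding code:

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // First pass: AnalyzeArguments forwards to AnalyzeFormalArguments and
  // assigns locations to everything except HVA members (for which only
  // shadow registers are reserved).
  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // Second pass (vectorcall only): the assign function is re-run with the
  // SecArgPass flag set so only HVA members are placed, and the location
  // list is then merged back into original value-number order.
  if (CallConv == CallingConv::X86_VectorCall)
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
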
Modified: llvm/trunk/include/llvm/Target/TargetCallingConv.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetCallingConv.h?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetCallingConv.h (original)
+++ llvm/trunk/include/llvm/Target/TargetCallingConv.h Wed Dec 21 02:31:45 2016
@@ -51,6 +51,15 @@ namespace ISD {
     static const uint64_t SwiftSelfOffs  = 14;
     static const uint64_t SwiftError     = 1ULL<<15; ///< Swift error parameter
     static const uint64_t SwiftErrorOffs = 15;
+    static const uint64_t Hva            = 1ULL << 16; ///< HVA field for
+                                                       ///< vectorcall
+    static const uint64_t HvaOffs        = 16;
+    static const uint64_t HvaStart       = 1ULL << 17; ///< HVA structure start
+                                                       ///< for vectorcall
+    static const uint64_t HvaStartOffs   = 17;
+    static const uint64_t SecArgPass     = 1ULL << 18; ///< Second argument
+                                                       ///< pass for vectorcall
+    static const uint64_t SecArgPassOffs = 18;
     static const uint64_t OrigAlign      = 0x1FULL<<27;
     static const uint64_t OrigAlignOffs  = 27;
     static const uint64_t ByValSize      = 0x3fffffffULL<<32; ///< Struct size
@@ -91,6 +100,15 @@ namespace ISD {
     bool isSwiftError() const { return Flags & SwiftError; }
     void setSwiftError() { Flags |= One << SwiftErrorOffs; }
 
+    bool isHva() const { return Flags & Hva; }
+    void setHva() { Flags |= One << HvaOffs; }
+
+    bool isHvaStart() const { return Flags & HvaStart; }
+    void setHvaStart() { Flags |= One << HvaStartOffs; }
+
+    bool isSecArgPass() const { return Flags & SecArgPass; }
+    void setSecArgPass() { Flags |= One << SecArgPassOffs; }
+
     bool isNest()      const { return Flags & Nest; }
     void setNest()     { Flags |= One << NestOffs; }
 

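As a quick illustration (not part of the diff) of how the new flag bits are meant to flow, mirroring the SelectionDAGBuilder and X86CallingConv hunks below (Value is assumed to be the index of the scalar within the aggregate being lowered):

  ISD::ArgFlagsTy Flags;
  // Every scalar produced from an HVA argument is tagged; the first one also
  // carries HvaStart so the CC code can reserve its shadow registers once.
  if (Value == 0)
    Flags.setHvaStart();
  Flags.setHva();
  Flags.setInReg();

  // The calling-convention hook later checks SecArgPass to know it is running
  // the second pass and should now hand out real XMM registers to HVA members.
  if (Flags.isSecArgPass() && Flags.isHva()) {
    // ...assign an XMM register (see CC_X86_VectorCallAssignRegister below)...
  }
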
Modified: llvm/trunk/lib/CodeGen/CallingConvLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CallingConvLower.cpp?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/CallingConvLower.cpp (original)
+++ llvm/trunk/lib/CodeGen/CallingConvLower.cpp Wed Dec 21 02:31:45 2016
@@ -23,6 +23,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
 using namespace llvm;
 
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
@@ -64,6 +66,22 @@ void CCState::MarkAllocated(unsigned Reg
     UsedRegs[*AI/32] |= 1 << (*AI&31);
 }
 
+bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
+  if (!isAllocated(Reg))
+    return false;
+
+  for (auto const &ValAssign : Locs) {
+    if (ValAssign.isRegLoc()) {
+      for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true);
+           AI.isValid(); ++AI) {
+        if (*AI == Reg)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
 /// Analyze an array of argument values,
 /// incorporating info about the formals into this state.
 void

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Wed Dec 21 02:31:45 2016
@@ -7732,8 +7732,19 @@ TargetLowering::LowerCallTo(TargetLoweri
         Flags.setZExt();
       if (Args[i].isSExt)
         Flags.setSExt();
-      if (Args[i].isInReg)
+      if (Args[i].isInReg) {
+        // If we are using vectorcall calling convention, a structure that is
+        // passed InReg - is surely an HVA
+        if (CLI.CallConv == CallingConv::X86_VectorCall &&
+            isa<StructType>(FinalType)) {
+          // The first value of a structure is marked
+          if (0 == Value)
+            Flags.setHvaStart();
+          Flags.setHva();
+        }
+        // Set InReg Flag
         Flags.setInReg();
+      }
       if (Args[i].isSRet)
         Flags.setSRet();
       if (Args[i].isSwiftSelf)
@@ -8019,8 +8030,19 @@ void SelectionDAGISel::LowerArguments(co
         Flags.setZExt();
       if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
         Flags.setSExt();
-      if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+      if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
+        // If we are using vectorcall calling convention, a structure that is
+        // passed InReg - is surely an HVA
+        if (F.getCallingConv() == CallingConv::X86_VectorCall &&
+            isa<StructType>(I->getType())) {
+          // The first value of a structure is marked
+          if (0 == Value)
+            Flags.setHvaStart();
+          Flags.setHva();
+        }
+        // Set InReg Flag
         Flags.setInReg();
+      }
       if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
         Flags.setSRet();
       if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))

Modified: llvm/trunk/lib/Target/X86/X86CallingConv.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.cpp?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.cpp Wed Dec 21 02:31:45 2016
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/IR/CallingConv.h"
 
@@ -39,14 +40,14 @@ bool CC_X86_32_RegCall_Assign2Regs(unsig
   if (AvailableRegs.size() < RequiredGprsUponSplit)
     return false; // Not enough free registers - continue the search.
 
-  // Allocating the available registers
+  // Allocating the available registers.
   for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
 
-    // Marking the register as located
+    // Marking the register as located.
     unsigned Reg = State.AllocateReg(AvailableRegs[I]);
 
     // Since we previously made sure that 2 registers are available
-    // we expect that a real register number will be returned
+    // we expect that a real register number will be returned.
     assert(Reg && "Expecting a register will be available");
 
     // Assign the value to the allocated register
@@ -57,4 +58,151 @@ bool CC_X86_32_RegCall_Assign2Regs(unsig
   return true;
 }
 
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+  if (ValVT.is512BitVector()) {
+    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};
+    return RegListZMM;
+  }
+
+  if (ValVT.is256BitVector()) {
+    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+                                           X86::YMM3, X86::YMM4, X86::YMM5};
+    return RegListYMM;
+  }
+
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+                                         X86::XMM3, X86::XMM4, X86::XMM5};
+  return RegListXMM;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+  return RegListGPR;
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+                                            MVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+
+  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+  bool Is64bit = static_cast<const X86Subtarget &>(
+                     State.getMachineFunction().getSubtarget())
+                     .is64Bit();
+
+  for (auto Reg : RegList) {
+    // If the register is not marked as allocated - assign to it.
+    if (!State.isAllocated(Reg)) {
+      unsigned AssigedReg = State.AllocateReg(Reg);
+      assert(AssigedReg == Reg && "Expecting a valid register allocation");
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));
+      return true;
+    }
+    // If the register is marked as shadow allocated - assign to it.
+    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return true;
+    }
+  }
+
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "
+                   "an available register.");
+  return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    // If R9 was already assigned it means that we are after the fourth element
+    // and because this is not an HVA / Vector type, we need to allocate
+    // shadow XMM register.
+    if (State.isAllocated(X86::R9)) {
+      // Assign shadow XMM register.
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+    }
+
+    return false;
+  }
+
+  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+    // Assign shadow GPR register.
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+    // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+      // In the vectorcall calling convention, additional shadow stack space
+      // can be created on top of the basic 32 bytes of Win64.
+      // This can happen if the fifth or sixth argument is a vector type or
+      // an HVA. In that case, a shadow stack slot of 8 bytes is allocated
+      // for each such argument.
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)
+        State.AllocateStack(8, 8);
+
+      if (!ArgFlags.isHva()) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true; // Allocated a register - Stop the search.
+      }
+    }
+  }
+
+  // If this is an HVA - Stop the search,
+  // otherwise continue the search.
+  return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating point type, for example,
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    return false;
+  }
+
+  if (ArgFlags.isHva())
+    return true; // If this is an HVA - Stop the search.
+
+  // Assign XMM register.
+  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // In case we did not find an available XMM register for a vector -
+  // pass it indirectly.
+  // It is similar to CCPassIndirect, with the addition of inreg.
+  if (!ValVT.isFloatingPoint()) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::Indirect;
+    ArgFlags.setInReg();
+  }
+
+  return false; // No register was assigned - Continue the search.
+}
+
 } // End llvm namespace

Modified: llvm/trunk/lib/Target/X86/X86CallingConv.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.h?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.h (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.h Wed Dec 21 02:31:45 2016
@@ -24,22 +24,29 @@ namespace llvm {
 /// When regcall calling convention compiled to 32 bit arch, special treatment
 /// is required for 64 bit masks.
 /// The value should be assigned to two GPRs.
-/// @return true if registers were allocated and false otherwise
+/// \return true if registers were allocated and false otherwise.
 bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                    CCValAssign::LocInfo &LocInfo,
                                    ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
-inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
-                                         MVT &LocVT,
-                                         CCValAssign::LocInfo &LocInfo,
-                                         ISD::ArgFlagsTy &ArgFlags,
-                                         CCState &State) {
-  // Similar to CCPassIndirect, with the addition of inreg.
-  LocVT = MVT::i32;
-  LocInfo = CCValAssign::Indirect;
-  ArgFlags.setInReg();
-  return false; // Continue the search, but now for i32.
-}
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,

Modified: llvm/trunk/lib/Target/X86/X86CallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.td?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.td (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td Wed Dec 21 02:31:45 2016
@@ -308,20 +308,12 @@ def RetCC_X86_32_HiPE : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
-// X86-32 HiPE return-value convention.
+// X86-32 Vectorcall return-value convention.
 def RetCC_X86_32_VectorCall : CallingConv<[
-  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  // Floating point types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[f32, f64, f128],
             CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
 
-  // 256-bit FP vectors
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
-
-  // 512-bit FP vectors
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
-
   // Return integers in the standard way.
   CCDelegateTo<RetCC_X86Common>
 ]>;
@@ -350,6 +342,16 @@ def RetCC_X86_Win64_C : CallingConv<[
   CCDelegateTo<RetCC_X86_64_C>
 ]>;
 
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128], 
+    CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Otherwise, everything is the same as Windows X86-64 C CC.
+  CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
 // X86-64 HiPE return-value convention.
 def RetCC_X86_64_HiPE : CallingConv<[
   // Promote all types to i64
@@ -447,6 +449,9 @@ def RetCC_X86_64 : CallingConv<[
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
 
+  // Handle Vectorcall CC
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
   // Handle HHVM calls.
   CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
 
@@ -626,18 +631,7 @@ def CC_X86_Win64_C : CallingConv<[
 ]>;
 
 def CC_X86_Win64_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+  CCCustom<"CC_X86_64_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_Win64_C>
@@ -847,25 +841,9 @@ def CC_X86_32_FastCall : CallingConv<[
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
-def CC_X86_32_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
-
-  // Otherwise, pass it indirectly.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
-            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
-            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+def CC_X86_Win32_VectorCall : CallingConv<[
+  // Pass floating point in XMMs
+  CCCustom<"CC_X86_32_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_32_FastCall>
@@ -999,7 +977,7 @@ def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
   CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
-  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Dec 21 02:31:45 2016
@@ -17,6 +17,7 @@
 #include "X86CallingConv.h"
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
@@ -53,10 +54,10 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOptions.h"
-#include "X86IntrinsicsInfo.h"
+#include <algorithm>
 #include <bitset>
-#include <numeric>
 #include <cctype>
+#include <numeric>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-isel"
@@ -2781,6 +2782,13 @@ static ArrayRef<MCPhysReg> get64BitArgum
   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
 }
 
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                          return A.getValNo() < B.getValNo();
+                        });
+}
+
 SDValue X86TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -2815,11 +2823,22 @@ SDValue X86TargetLowering::LowerFormalAr
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
 
   SDValue ArgValue;
   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -3263,11 +3282,17 @@ X86TargetLowering::LowerCall(TargetLower
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
@@ -3322,6 +3347,11 @@ X86TargetLowering::LowerCall(TargetLower
   SmallVector<SDValue, 8> MemOpChains;
   SDValue StackPtr;
 
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
+
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

Modified: llvm/trunk/test/CodeGen/X86/vectorcall.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vectorcall.ll?rev=290240&r1=290239&r2=290240&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vectorcall.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vectorcall.ll Wed Dec 21 02:31:45 2016
@@ -6,14 +6,12 @@
 define x86_vectorcallcc i32 @test_int_1() {
   ret i32 0
 }
-
 ; CHECK-LABEL: {{^}}test_int_1@@0:
 ; CHECK: xorl %eax, %eax
 
 define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
   ret i32 %a
 }
-
 ; X86-LABEL: {{^}}test_int_2@@4:
 ; X64-LABEL: {{^}}test_int_2@@8:
 ; CHECK: movl %ecx, %eax
@@ -22,7 +20,6 @@ define x86_vectorcallcc i32 @test_int_3(
   %at = trunc i64 %a to i32
   ret i32 %at
 }
-
 ; X86-LABEL: {{^}}test_int_3@@8:
 ; X64-LABEL: {{^}}test_int_3@@8:
 ; CHECK: movl %ecx, %eax
@@ -31,10 +28,8 @@ define x86_vectorcallcc i32 @test_int_4(
   %s = add i32 %a, %b
   ret i32 %s
 }
-
 ; X86-LABEL: {{^}}test_int_4@@8:
 ; X86: leal (%ecx,%edx), %eax
-
 ; X64-LABEL: {{^}}test_int_4@@16:
 ; X64: leal (%rcx,%rdx), %eax
 
@@ -90,4 +85,139 @@ define x86_vectorcallcc <16 x i8> @test_
   ret <16 x i8> %r
 }
 ; CHECK-LABEL: {{^}}test_vec_2@@104:
-; CHECK: movaps (%{{[re]}}cx), %xmm0
+; x64:           movq    {{[0-9]*}}(%rsp), %rax
+; CHECK:         movaps (%{{rax|ecx}}), %xmm0
+
+%struct.HVA5 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA4 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA3 = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA2 = type { <4 x float>, <4 x float> }
+
+define x86_vectorcallcc <4 x float> @test_mixed_1(i32 %a, %struct.HVA4 inreg %bb, i32 %c) {
+entry:
+  %b = alloca %struct.HVA4, align 16
+  store %struct.HVA4 %bb, %struct.HVA4* %b, align 16
+  %w1 = getelementptr inbounds %struct.HVA4, %struct.HVA4* %b, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %w1, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_1
+; CHECK:       movaps	%xmm1, 16(%{{(e|r)}}sp)
+; CHECK:       movaps	16(%{{(e|r)}}sp), %xmm0
+; CHECK:       ret{{q|l}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) {
+entry:
+  %c.addr = alloca <4 x float>, align 16
+  store <4 x float> %c, <4 x float>* %c.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %c.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_2
+; X86:         movaps  %xmm0, (%esp)
+; X64:         movaps  %xmm2, %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) {
+entry:
+  %x = getelementptr inbounds %struct.HVA2, %struct.HVA2* %f, i32 0, i32 0
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_3
+; CHECK:       movaps	(%{{[re][ac]}}x), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_4(%struct.HVA4 inreg %a, %struct.HVA2* %bb, <4 x float> %c) {
+entry:
+  %y4 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %bb, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y4, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_4
+; X86:         movaps	16(%eax), %xmm0
+; X64:         movaps	16(%rdx), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_5(%struct.HVA3 inreg %a, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %dd) {
+entry:
+  %d = alloca %struct.HVA2, align 16
+  store %struct.HVA2 %dd, %struct.HVA2* %d, align 16
+  %y5 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %d, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y5, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_5
+; CHECK:       movaps	%xmm5, 16(%{{(e|r)}}sp)
+; CHECK:       movaps	16(%{{(e|r)}}sp), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
+entry:
+  %retval = alloca %struct.HVA4, align 16
+  %0 = bitcast %struct.HVA4* %retval to i8*
+  %1 = bitcast %struct.HVA4* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 64, i32 16, i1 false)
+  %2 = load %struct.HVA4, %struct.HVA4* %retval, align 16
+  ret %struct.HVA4 %2
+}
+; CHECK-LABEL: test_mixed_6
+; CHECK:       movaps	(%{{[re]}}sp), %xmm0
+; CHECK:       movaps	16(%{{[re]}}sp), %xmm1
+; CHECK:       movaps	32(%{{[re]}}sp), %xmm2
+; CHECK:       movaps	48(%{{[re]}}sp), %xmm3
+; CHECK:       ret{{[ql]}}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1)
+
+define x86_vectorcallcc void @test_mixed_7(%struct.HVA5* noalias sret %agg.result) {
+entry:
+  %a = alloca %struct.HVA5, align 16
+  %0 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 80, i32 16, i1 false)
+  %1 = bitcast %struct.HVA5* %agg.result to i8*
+  %2 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 80, i32 16, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_mixed_7
+; CHECK:       movaps	%xmm{{[0-9]}}, 64(%{{rcx|eax}})
+; CHECK:       movaps	%xmm{{[0-9]}}, 48(%{{rcx|eax}})
+; CHECK:       movaps	%xmm{{[0-9]}}, 32(%{{rcx|eax}})
+; CHECK:       movaps	%xmm{{[0-9]}}, 16(%{{rcx|eax}})
+; CHECK:       movaps	%xmm{{[0-9]}}, (%{{rcx|eax}})
+; X64:         mov{{[ql]}}	%rcx, %rax
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) {
+entry:
+  %f.addr = alloca <4 x float>, align 16
+  store <4 x float> %f, <4 x float>* %f.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %f.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_8
+; X86:         movaps	%xmm4, %xmm0
+; X64:         movaps	%xmm5, %xmm0
+; CHECK:       ret{{[ql]}}
+
+%struct.HFA4 = type { double, double, double, double }
+declare x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 %x, double %y)
+
+define x86_vectorcallcc double @test_mixed_9_caller(%struct.HFA4 inreg %b) {
+entry:
+  %call = call x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 inreg %b, double 3.000000e+00)
+  %add = fadd double 1.000000e+00, %call
+  ret double %add
+}
+; CHECK-LABEL: test_mixed_9_caller
+; CHECK:       movaps  %xmm3, %xmm4
+; CHECK:       movaps  %xmm2, %xmm3
+; CHECK:       movaps  %xmm1, %xmm2
+; X32:         movasd  %xmm0, %xmm1
+; X64:         movapd  %xmm5, %xmm1
+; CHECK:       call{{l|q}}   test_mixed_9_callee@@40
+; CHECK:       addsd   {{.*}}, %xmm0
+; CHECK:       ret{{l|q}}



