[llvm-commits] [llvm] r166566 - in /llvm/trunk: include/llvm/CallingConv.h lib/AsmParser/LLLexer.cpp lib/AsmParser/LLParser.cpp lib/AsmParser/LLToken.h lib/Target/X86/X86CallingConv.td lib/Target/X86/X86RegisterInfo.cpp lib/VMCore/AsmWriter.cpp lib/VMCore/Verifier.cpp test/CodeGen/X86/avx-intel-ocl.ll test/CodeGen/X86/sse-intel-ocl.ll

Wed Oct 24 07:46:17 PDT 2012

Author: delena
Date: Wed Oct 24 09:46:16 2012
New Revision: 166566

URL: http://llvm.org/viewvc/llvm-project?rev=166566&view=rev
Log:
Special calling conventions for Intel OpenCL built-in library.

Added:
    llvm/trunk/test/CodeGen/X86/avx-intel-ocl.ll
    llvm/trunk/test/CodeGen/X86/sse-intel-ocl.ll
Modified:
    llvm/trunk/include/llvm/CallingConv.h
    llvm/trunk/lib/AsmParser/LLLexer.cpp
    llvm/trunk/lib/AsmParser/LLParser.cpp
    llvm/trunk/lib/AsmParser/LLToken.h
    llvm/trunk/lib/Target/X86/X86CallingConv.td
    llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp
    llvm/trunk/lib/VMCore/AsmWriter.cpp
    llvm/trunk/lib/VMCore/Verifier.cpp

Modified: llvm/trunk/include/llvm/CallingConv.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CallingConv.h?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================

--- llvm/trunk/include/llvm/CallingConv.h (original)
+++ llvm/trunk/include/llvm/CallingConv.h Wed Oct 24 09:46:16 2012
@@ -112,7 +112,11 @@
     /// Cannot have variable arguments.
     /// Can also be called by the host.
     /// Is externally visible.
-    SPIR_KERNEL = 76
+    SPIR_KERNEL = 76,
+
+    /// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins
+    Intel_OCL_BI = 77
+
   };
 } // End CallingConv namespace
 

Modified: llvm/trunk/lib/AsmParser/LLLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/AsmParser/LLLexer.cpp?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/AsmParser/LLLexer.cpp (original)
+++ llvm/trunk/lib/AsmParser/LLLexer.cpp Wed Oct 24 09:46:16 2012
@@ -527,6 +527,7 @@
   KEYWORD(ptx_device);
   KEYWORD(spir_kernel);
   KEYWORD(spir_func);
+  KEYWORD(intel_ocl_bicc);
 
   KEYWORD(cc);
   KEYWORD(c);

Modified: llvm/trunk/lib/AsmParser/LLParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/AsmParser/LLParser.cpp?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/AsmParser/LLParser.cpp (original)
+++ llvm/trunk/lib/AsmParser/LLParser.cpp Wed Oct 24 09:46:16 2012
@@ -1094,6 +1094,7 @@
 ///   ::= /*empty*/
 ///   ::= 'ccc'
 ///   ::= 'fastcc'
+///   ::= 'kw_intel_ocl_bicc'
 ///   ::= 'coldcc'
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
@@ -1125,6 +1126,7 @@
   case lltok::kw_ptx_device:     CC = CallingConv::PTX_Device; break;
   case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
+  case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
   case lltok::kw_cc: {
       unsigned ArbitraryCC;
       Lex.Lex();

Modified: llvm/trunk/lib/AsmParser/LLToken.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/AsmParser/LLToken.h?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/AsmParser/LLToken.h (original)
+++ llvm/trunk/lib/AsmParser/LLToken.h Wed Oct 24 09:46:16 2012
@@ -77,6 +77,7 @@
     kw_c,
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
+	  kw_intel_ocl_bicc,
     kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
     kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
     kw_msp430_intrcc,

Modified: llvm/trunk/lib/Target/X86/X86CallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.td?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.td (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td Wed Oct 24 09:46:16 2012
@@ -88,6 +88,21 @@
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  // No more than 4 registers
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // i32, i64 in the standard way
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
 // X86-64 C return-value convention.
 def RetCC_X86_64_C : CallingConv<[
   // The X86-64 calling convention always returns FP values in XMM0.
@@ -128,6 +143,10 @@
 
 // This is the return-value convention used for the entire X86 backend.
 def RetCC_X86 : CallingConv<[
+
+  // Check if this is the Intel OpenCL built-ins calling convention
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
   CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
   CCDelegateTo<RetCC_X86_32>
 ]>;
@@ -235,6 +254,29 @@
   CCIfType<[f80], CCAssignToStack<0, 0>>
 ]>;
 
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>,
+
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+  CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8,  R9 ]>>>,
+
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+  
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+  
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+
 def CC_X86_64_GHC : CallingConv<[
   // Promote i8/i16/i32 arguments to i64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -408,6 +450,7 @@
 
 // This is the argument convention used for the entire X86 backend.
 def CC_X86 : CallingConv<[
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
   CCDelegateTo<CC_X86_32>
 ]>;
@@ -426,3 +469,17 @@
 
 def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
                                      (sequence "XMM%u", 6, 15))>;
+
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+                                                  R13, R14, R15, 
+                                                  (sequence "YMM%u", 6, 15))>;
+
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
+                                                 (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "YMM%u", 8, 15))>;

Modified: llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp Wed Oct 24 09:46:16 2012
@@ -229,15 +229,26 @@
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   bool callsEHReturn = false;
   bool ghcCall = false;
+  bool oclBiCall = false;
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
 
   if (MF) {
     callsEHReturn = MF->getMMI().callsEHReturn();
     const Function *F = MF->getFunction();
     ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+    oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
   }
 
   if (ghcCall)
     return CSR_NoRegs_SaveList;
+  if (oclBiCall) {
+    if (HasAVX && IsWin64)
+        return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+    if (HasAVX && Is64Bit)
+        return CSR_64_Intel_OCL_BI_AVX_SaveList;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+        return CSR_64_Intel_OCL_BI_SaveList;
+  }
   if (Is64Bit) {
     if (IsWin64)
       return CSR_Win64_SaveList;
@@ -252,6 +263,16 @@
 
 const uint32_t*
 X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+  if (CC == CallingConv::Intel_OCL_BI) {
+    if (IsWin64 && HasAVX)
+      return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+    if (Is64Bit && HasAVX)
+      return CSR_64_Intel_OCL_BI_AVX_RegMask;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_RegMask;
+  }
   if (CC == CallingConv::GHC)
     return CSR_NoRegs_RegMask;
   if (!Is64Bit)

Modified: llvm/trunk/lib/VMCore/AsmWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/VMCore/AsmWriter.cpp?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/VMCore/AsmWriter.cpp (original)
+++ llvm/trunk/lib/VMCore/AsmWriter.cpp Wed Oct 24 09:46:16 2012
@@ -74,6 +74,7 @@
     case CallingConv::X86_StdCall:  Out << "x86_stdcallcc"; break;
     case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
     case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
+    case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
     case CallingConv::ARM_APCS:     Out << "arm_apcscc"; break;
     case CallingConv::ARM_AAPCS:    Out << "arm_aapcscc"; break;
     case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break;

Modified: llvm/trunk/lib/VMCore/Verifier.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/VMCore/Verifier.cpp?rev=166566&r1=166565&r2=166566&view=diff
==============================================================================
--- llvm/trunk/lib/VMCore/Verifier.cpp (original)
+++ llvm/trunk/lib/VMCore/Verifier.cpp Wed Oct 24 09:46:16 2012
@@ -705,6 +705,7 @@
   case CallingConv::Cold:
   case CallingConv::X86_FastCall:
   case CallingConv::X86_ThisCall:
+  case CallingConv::Intel_OCL_BI:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
     Assert1(!F.isVarArg(),

Added: llvm/trunk/test/CodeGen/X86/avx-intel-ocl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-intel-ocl.ll?rev=166566&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intel-ocl.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx-intel-ocl.ll Wed Oct 24 09:46:16 2012
@@ -0,0 +1,107 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN64: leaq    {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl    %eax, (%esp)
+; WIN32: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN32: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: vaddps  {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: vaddps  {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: leaq    {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved ymm6-ymm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: vaddps  {{%ymm[6-7]}}, %ymm0, %ymm0
+; WIN64: vaddps  {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: ret
+
+; preserved ymm8-ymm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: vaddps  {{%ymm[8-9]}}, %ymm0, %ymm0
+; NOT_WIN: vaddps  {{%ymm[8-9]}}, %ymm1, %ymm1
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: call
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: call
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+   ret <16 x float> %c
+}
\ No newline at end of file

Added: llvm/trunk/test/CodeGen/X86/sse-intel-ocl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intel-ocl.ll?rev=166566&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intel-ocl.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse-intel-ocl.ll Wed Oct 24 09:46:16 2012
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: leaq    {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl    %eax, (%esp)
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: leaq    {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved xmm6-xmm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
+; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
+; WIN64: ret
+
+; preserved xmm8-xmm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: call
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+   ret <16 x float> %c
+}