[llvm] r229482 - AVX-512: changes in intel_ocl_bi calling conventions
Elena Demikhovsky
elena.demikhovsky at intel.com
Tue Feb 17 01:20:13 PST 2015
Author: delena
Date: Tue Feb 17 03:20:12 2015
New Revision: 229482
URL: http://llvm.org/viewvc/llvm-project?rev=229482&view=rev
Log:
AVX-512: changes in intel_ocl_bi calling conventions
- added mask types v8i1 and v16i1 to possible function parameters
- enabled passing 512-bit vectors in standard CC
- added a test for KNL intel_ocl_bi conventions
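For illustration only (not part of the commit): with the standard-CC change above, a 512-bit
vector argument compiled for a KNL-class target is expected to arrive directly in a ZMM register
even under the default C calling convention. A minimal sketch in the style of the new test, with
a hypothetical function name:

; hypothetical example: under -mcpu=knl the two <16 x float> arguments are
; expected in %zmm0 and %zmm1 under the default calling convention
define <16 x float> @add512(<16 x float> %a, <16 x float> %b) nounwind {
  %sum = fadd <16 x float> %a, %b
  ret <16 x float> %sum
}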
Added:
llvm/trunk/test/CodeGen/X86/avx512-intel-ocl.ll
Modified:
llvm/trunk/lib/Target/X86/X86CallingConv.td
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
Modified: llvm/trunk/lib/Target/X86/X86CallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.td?rev=229482&r1=229481&r2=229482&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.td (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td Tue Feb 17 03:20:12 2015
@@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+ // The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
@@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToStack<32, 32>>,
+ // The 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>,
+
// __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
// passed in the parameter area.
CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
@@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[v16f32, v8f64, v16i32, v8i64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+ // Pass masks in mask registers
+ CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
CCDelegateTo<CC_X86_32_C>
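A brief aside on the hunk above: under intel_ocl_bi a v16i1 or v8i1 argument is now assigned to
the K1 mask register instead of going through a GPR or the stack. A minimal sketch (illustrative
only, mirroring the mask test added below; @masked_op and @call_with_mask are hypothetical names):

declare <16 x float> @masked_op(<16 x float>, <16 x i1>)

define <16 x float> @call_with_mask(<16 x float> %v, i16 %m) nounwind {
  ; the i16 mask is expected to be materialized in %k1 (e.g. via kmovw) at the call site
  %mask = bitcast i16 %m to <16 x i1>
  %r = call intel_ocl_bicc <16 x float> @masked_op(<16 x float> %v, <16 x i1> %mask)
  ret <16 x float> %r
}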
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=229482&r1=229481&r2=229482&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Tue Feb 17 03:20:12 2015
@@ -1604,14 +1604,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_p
//
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
- ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
+ ValueType vvt, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
let mayLoad = 1 in
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
+ [(set KRC:$dst, (vvt (load addr:$src)))]>;
let mayStore = 1 in
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -1631,27 +1631,25 @@ multiclass avx512_mask_mov_gpr<bits<8> o
}
let Predicates = [HasDQI] in
- defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
- i8mem>,
+ defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
VEX, PD;
let Predicates = [HasAVX512] in
- defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
- i16mem>,
+ defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
VEX, PS;
let Predicates = [HasBWI] in {
- defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
- i32mem>, VEX, PD, VEX_W;
+ defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
+ VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
VEX, XD;
}
let Predicates = [HasBWI] in {
- defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
- i64mem>, VEX, PS, VEX_W;
+ defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+ VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
VEX, XD, VEX_W;
}
@@ -1682,24 +1680,34 @@ let Predicates = [HasBWI] in {
let Predicates = [HasDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVBmk addr:$dst, VK8:$src)>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (KMOVBkm addr:$src)>;
+}
+let Predicates = [HasAVX512, NoDQI] in {
+ def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+ (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
- (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
def : Pat<(i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
- def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
- (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+ def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+ (KMOVWkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
(KMOVDmk addr:$dst, VK32:$src)>;
+ def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+ (KMOVDkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
(KMOVQmk addr:$dst, VK64:$src)>;
+ def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+ (KMOVQkm addr:$src)>;
}
let Predicates = [HasAVX512] in {
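For context on the pattern changes above: the mask-move multiclass now matches the mask-typed
load directly, and the integer-to-mask bitconvert loads are expressed as standalone patterns per
feature level (DQI, AVX512-without-DQI, BWI). An assumed IR fragment, not taken from the commit,
that these new patterns are meant to select into a single kmovw load:

define <16 x i1> @load_mask16(i16* %p) nounwind {
  ; with AVX-512 this load+bitcast should select to a kmovw from memory
  %bits = load i16* %p, align 2
  %m = bitcast i16 %bits to <16 x i1>
  ret <16 x i1> %m
}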
Added: llvm/trunk/test/CodeGen/X86/avx512-intel-ocl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intel-ocl.ll?rev=229482&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intel-ocl.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512-intel-ocl.ll Tue Feb 17 03:20:12 2015
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+declare i32 @func_int(i32, i32)
+
+; WIN64-LABEL: testf16_inp
+; WIN64: vaddps {{.*}}, {{%zmm[0-1]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; X32-LABEL: testf16_inp
+; X32: vaddps {{.*}}, {{%zmm[0-1]}}
+; X32: movl %eax, (%esp)
+; X32: call
+; X32: ret
+
+; X64-LABEL: testf16_inp
+; X64: vaddps {{.*}}, {{%zmm[0-1]}}
+; X64: leaq {{.*}}(%rsp), %rdi
+; X64: call
+; X64: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved zmm16-
+; WIN64-LABEL: testf16_regs
+; WIN64: call
+; WIN64: vaddps %zmm16, %zmm0, %zmm0
+; WIN64: ret
+
+; preserved zmm16-
+; X64-LABEL: testf16_regs
+; X64: call
+; X64: vaddps %zmm16, %zmm0, %zmm0
+; X64: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64-LABEL: test_prolog_epilog
+; WIN64: vmovups %zmm21, {{.*(%rbp).*}} # 64-byte Spill
+; WIN64: vmovups %zmm6, {{.*(%rbp).*}} # 64-byte Spill
+; WIN64: call
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm6 # 64-byte Reload
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
+
+; X64-LABEL: test_prolog_epilog
+; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
+; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
+; X64: call
+; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
+; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}
+
+
+declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
+
+; X64-LABEL: testf16_inp_mask
+; X64: kmovw %edi, %k1
+; X64: call
+define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
+ %imask = bitcast i16 %mask to <16 x i1>
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
+ ret <16 x float> %1
+}
+
+; X64-LABEL: test_prolog_epilog_with_mask
+; X64: kxorw %k{{.*}}, %k{{.*}}, %k1
+; X64: call
+define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
+ %cmp_res = icmp eq <16 x i32>%x1, %x2
+ %mask1 = xor <16 x i1> %cmp_res, %mask
+ %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
+ ret <16 x float> %c
+}
\ No newline at end of file