[llvm] r295660 - [X86] Fix EXTRACT_VECTOR_ELT with variable index from v32i16 and v64i8 vector.
Igor Breger via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 20 06:16:30 PST 2017
Author: ibreger
Date: Mon Feb 20 08:16:29 2017
New Revision: 295660
URL: http://llvm.org/viewvc/llvm-project?rev=295660&view=rev
Log:
[X86] Fix EXTRACT_VECTOR_ELT with variable index from v32i16 and v64i8 vector.
It's more profitable to go through memory (1 cycle throughput)
than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput) to implement EXTRACT_VECTOR_ELT with variable index.
IACA tool was used to get performance estimation (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
For example for var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8 test from vector-shuffle-variable-128.ll I get 26 cycles vs 79 cycles.
Removing the VINSERT node, we don't need it any more.
Differential Revision: https://reviews.llvm.org/D29690
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
llvm/trunk/test/CodeGen/X86/extractelement-index.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Feb 20 08:16:29 2017
@@ -13776,24 +13776,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_E
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
- if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getScalarSizeInBits() == 32)) {
-
- MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
- MaskEltVT.getSizeInBits());
-
- Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
- DAG.getConstant(0, dl, PtrVT));
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
- DAG.getConstant(0, dl, PtrVT));
- }
+ // Its more profitable to go through memory (1 cycles throughput)
+ // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
+ // IACA tool was used to get performace estimation
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+ //
+ // exmample : extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
+
return SDValue();
}
@@ -23937,7 +23949,6 @@ const char *X86TargetLowering::getTarget
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Mon Feb 20 08:16:29 2017
@@ -446,8 +446,7 @@ namespace llvm {
// Broadcast subvector to vector.
SUBV_BROADCAST,
- // Insert/Extract vector element.
- VINSERT,
+ // Extract vector element.
VEXTRACT,
/// SSE4A Extraction and Insertion.
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Feb 20 08:16:29 2017
@@ -3580,19 +3580,6 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Mon Feb 20 08:16:29 2017
@@ -453,9 +453,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::S
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
- SDTCisPtrTy<3>]>, []>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
[SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Feb 20 08:16:29 2017
@@ -4702,19 +4702,6 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem,
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Mon Feb 20 08:16:29 2017
@@ -124,16 +124,30 @@ define void @test6(<4 x float> %x, float
define float @test7(<16 x float> %x, i32 %ind) nounwind {
; KNL-LABEL: test7:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $15, %edi
+; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $15, %edi
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
@@ -142,18 +156,30 @@ define float @test7(<16 x float> %x, i32
define double @test8(<8 x double> %x, i32 %ind) nounwind {
; KNL-LABEL: test8:
; KNL: ## BB#0:
-; KNL-NEXT: movslq %edi, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
-; SKX-NEXT: movslq %edi, %rax
-; SKX-NEXT: vmovq %rax, %xmm1
-; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
@@ -162,16 +188,30 @@ define double @test8(<8 x double> %x, i3
define float @test9(<8 x float> %x, i32 %ind) nounwind {
; KNL-LABEL: test9:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %ymm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
@@ -180,16 +220,30 @@ define float @test9(<8 x float> %x, i32
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; KNL-LABEL: test10:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $15, %edi
+; KNL-NEXT: movl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $15, %edi
+; SKX-NEXT: movl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
@@ -1514,18 +1568,42 @@ define i64 @test_extractelement_variable
define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v8i64:
; KNL: ## BB#0:
-; KNL-NEXT: movslq %edi, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi6:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi7:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi8:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: movq (%rsp,%rdi,8), %rax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8i64:
; SKX: ## BB#0:
-; SKX-NEXT: movslq %edi, %rax
-; SKX-NEXT: vmovq %rax, %xmm1
-; SKX-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; SKX-NEXT: vmovq %xmm0, %rax
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi3:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi4:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi5:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: movq (%rsp,%rdi,8), %rax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <8 x i64> %t1, i32 %index
ret i64 %t2
@@ -1555,12 +1633,12 @@ define double @test_extractelement_varia
; KNL-LABEL: test_extractelement_variable_v4f64:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi6:
+; KNL-NEXT: Lcfi9:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi7:
+; KNL-NEXT: Lcfi10:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi8:
+; KNL-NEXT: Lcfi11:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
@@ -1575,12 +1653,12 @@ define double @test_extractelement_varia
; SKX-LABEL: test_extractelement_variable_v4f64:
; SKX: ## BB#0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi3:
+; SKX-NEXT: Lcfi6:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi4:
+; SKX-NEXT: Lcfi7:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi5:
+; SKX-NEXT: Lcfi8:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
@@ -1598,18 +1676,42 @@ define double @test_extractelement_varia
define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v8f64:
; KNL: ## BB#0:
-; KNL-NEXT: movslq %edi, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi12:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi13:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi14:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8f64:
; SKX: ## BB#0:
-; SKX-NEXT: movslq %edi, %rax
-; SKX-NEXT: vmovq %rax, %xmm1
-; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi9:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi10:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi11:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <8 x double> %t1, i32 %index
ret double %t2
@@ -1638,16 +1740,42 @@ define i32 @test_extractelement_variable
define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v8i32:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi15:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi16:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi17:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: movl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8i32:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi12:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi13:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi14:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %ymm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: movl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <8 x i32> %t1, i32 %index
ret i32 %t2
@@ -1656,16 +1784,42 @@ define i32 @test_extractelement_variable
define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v16i32:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi18:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi19:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi20:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $15, %edi
+; KNL-NEXT: movl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16i32:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi15:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi16:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi17:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $15, %edi
+; SKX-NEXT: movl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <16 x i32> %t1, i32 %index
ret i32 %t2
@@ -1694,16 +1848,42 @@ define float @test_extractelement_variab
define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v8f32:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi21:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi22:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi23:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: andl $7, %edi
+; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8f32:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi18:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi19:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi20:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %ymm0, (%rsp)
+; SKX-NEXT: andl $7, %edi
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <8 x float> %t1, i32 %index
ret float %t2
@@ -1712,16 +1892,42 @@ define float @test_extractelement_variab
define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v16f32:
; KNL: ## BB#0:
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi24:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi25:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi26:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %zmm0, (%rsp)
+; KNL-NEXT: andl $15, %edi
+; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16f32:
; SKX: ## BB#0:
-; SKX-NEXT: vmovd %edi, %xmm1
-; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi21:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi22:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi23:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
+; SKX-NEXT: andl $15, %edi
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
; SKX-NEXT: retq
%t2 = extractelement <16 x float> %t1, i32 %index
ret float %t2
@@ -1751,12 +1957,12 @@ define i16 @test_extractelement_variable
; KNL-LABEL: test_extractelement_variable_v16i16:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi9:
+; KNL-NEXT: Lcfi27:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi10:
+; KNL-NEXT: Lcfi28:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi11:
+; KNL-NEXT: Lcfi29:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
@@ -1771,12 +1977,12 @@ define i16 @test_extractelement_variable
; SKX-LABEL: test_extractelement_variable_v16i16:
; SKX: ## BB#0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi6:
+; SKX-NEXT: Lcfi24:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi7:
+; SKX-NEXT: Lcfi25:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi8:
+; SKX-NEXT: Lcfi26:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
@@ -1791,11 +1997,50 @@ define i16 @test_extractelement_variable
ret i16 %t2
}
-; TODO - enable after fix
-;define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
-; %t2 = extractelement <32 x i16> %t1, i32 %index
-; ret i16 %t2
-;}
+define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi30:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi31:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi32:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: andl $31, %edi
+; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_variable_v32i16:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi27:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi28:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi29:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT: andl $31, %edi
+; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: retq
+ %t2 = extractelement <32 x i16> %t1, i32 %index
+ ret i16 %t2
+}
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v16i8:
@@ -1823,12 +2068,12 @@ define i8 @test_extractelement_variable_
; KNL-LABEL: test_extractelement_variable_v32i8:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi12:
+; KNL-NEXT: Lcfi33:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi13:
+; KNL-NEXT: Lcfi34:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi14:
+; KNL-NEXT: Lcfi35:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
@@ -1844,12 +2089,12 @@ define i8 @test_extractelement_variable_
; SKX-LABEL: test_extractelement_variable_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi9:
+; SKX-NEXT: Lcfi30:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi10:
+; SKX-NEXT: Lcfi31:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi11:
+; SKX-NEXT: Lcfi32:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
@@ -1866,8 +2111,101 @@ define i8 @test_extractelement_variable_
ret i8 %t2
}
-; TODO - enable after fix
-;define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
-; %t2 = extractelement <64 x i8> %t1, i32 %index
-; ret i8 %t2
-;}
+define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi36:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi37:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi38:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: andl $63, %edi
+; KNL-NEXT: movq %rsp, %rax
+; KNL-NEXT: movb (%rdi,%rax), %al
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi33:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi34:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi35:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: andl $63, %edi
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: movb (%rdi,%rax), %al
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: retq
+
+ %t2 = extractelement <64 x i8> %t1, i32 %index
+ ret i8 %t2
+}
+
+define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi39:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Lcfi40:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Lcfi41:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: addb %dil, %dil
+; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: movzbl %dil, %eax
+; KNL-NEXT: andl $63, %eax
+; KNL-NEXT: movq %rsp, %rcx
+; KNL-NEXT: movb (%rax,%rcx), %al
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: Lcfi36:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: Lcfi37:
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: Lcfi38:
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: addb %dil, %dil
+; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: movzbl %dil, %eax
+; SKX-NEXT: andl $63, %eax
+; SKX-NEXT: movq %rsp, %rcx
+; SKX-NEXT: movb (%rax,%rcx), %al
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: retq
+
+ %i = add i8 %index, %index
+ %t2 = extractelement <64 x i8> %t1, i8 %i
+ ret i8 %t2
+}
Modified: llvm/trunk/test/CodeGen/X86/extractelement-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-index.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-index.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-index.ll Mon Feb 20 08:16:29 2017
@@ -538,27 +538,19 @@ define i32 @extractelement_v8i32_var(<8
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX1-LABEL: extractelement_v8i32_var:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: andl $7, %edi
-; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: movl (%rsp,%rdi,4), %eax
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: extractelement_v8i32_var:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovd %edi, %xmm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: extractelement_v8i32_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $7, %edi
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: movl (%rsp,%rdi,4), %eax
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <8 x i32> %a, i256 %i
ret i32 %b
}
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll Mon Feb 20 08:16:29 2017
@@ -236,70 +236,43 @@ define <4 x i64> @var_shuffle_v4i64_v2i6
}
define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
-; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX1-NEXT: andl $7, %edi
-; AVX1-NEXT: andl $7, %esi
-; AVX1-NEXT: andl $7, %edx
-; AVX1-NEXT: andl $7, %ecx
-; AVX1-NEXT: andl $7, %r8d
-; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: andl $7, %r9d
-; AVX1-NEXT: movl 16(%rbp), %r10d
-; AVX1-NEXT: andl $7, %r10d
-; AVX1-NEXT: movl 24(%rbp), %eax
-; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovd %edi, %xmm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vmovd %esi, %xmm2
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2
-; AVX2-NEXT: vmovd %edx, %xmm3
-; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT: vmovd %ecx, %xmm4
-; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
-; AVX2-NEXT: vmovd %r8d, %xmm5
-; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
-; AVX2-NEXT: vmovd %r9d, %xmm6
-; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6
-; AVX2-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7
-; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT: andl $7, %edi
+; ALL-NEXT: andl $7, %esi
+; ALL-NEXT: andl $7, %edx
+; ALL-NEXT: andl $7, %ecx
+; ALL-NEXT: andl $7, %r8d
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: andl $7, %r9d
+; ALL-NEXT: movl 16(%rbp), %r10d
+; ALL-NEXT: andl $7, %r10d
+; ALL-NEXT: movl 24(%rbp), %eax
+; ALL-NEXT: andl $7, %eax
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 %i0
%x1 = extractelement <8 x float> %x, i32 %i1
%x2 = extractelement <8 x float> %x, i32 %i2
More information about the llvm-commits
mailing list