[llvm] r295660 - [X86] Fix EXTRACT_VECTOR_ELT with variable index from v32i16 and v64i8 vector.

Igor Breger via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 20 06:16:30 PST 2017


Author: ibreger
Date: Mon Feb 20 08:16:29 2017
New Revision: 295660

URL: http://llvm.org/viewvc/llvm-project?rev=295660&view=rev
Log:
[X86] Fix EXTRACT_VECTOR_ELT with variable index from v32i16 and v64i8 vector.

Its more profitable to go through memory (1 cycles throughput)
than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) to implement EXTRACT_VECTOR_ELT with variable index.
IACA tool was used to get performace estimation (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
For example for var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8 test from vector-shuffle-variable-128.ll I get 26 cycles vs 79 cycles. 
Removing the VINSERT node, we don't need it any more.

Differential Revision: https://reviews.llvm.org/D29690


Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/extractelement-index.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Feb 20 08:16:29 2017
@@ -13776,24 +13776,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_E
     return ExtractBitFromMaskVector(Op, DAG);
 
   if (!isa<ConstantSDNode>(Idx)) {
-    if (VecVT.is512BitVector() ||
-        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
-         VecVT.getScalarSizeInBits() == 32)) {
-
-      MVT MaskEltVT =
-        MVT::getIntegerVT(VecVT.getScalarSizeInBits());
-      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
-                                    MaskEltVT.getSizeInBits());
-
-      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
-      auto PtrVT = getPointerTy(DAG.getDataLayout());
-      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
-                                 DAG.getConstant(0, dl, PtrVT));
-      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
-                         DAG.getConstant(0, dl, PtrVT));
-    }
+    // Its more profitable to go through memory (1 cycles throughput)
+    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
+    // IACA tool was used to get performace estimation
+    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+    //
+    // exmample : extractelement <16 x i8> %a, i32 %i
+    //
+    // Block Throughput: 3.00 Cycles
+    // Throughput Bottleneck: Port5
+    //
+    // | Num Of |   Ports pressure in cycles  |    |
+    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
+    // ---------------------------------------------
+    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
+    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
+    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
+    // Total Num Of Uops: 4
+    //
+    //
+    // Block Throughput: 1.00 Cycles
+    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+    //
+    // |    |  Ports pressure in cycles   |  |
+    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
+    // ---------------------------------------------------------
+    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
+    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
+    // Total Num Of Uops: 4
+
     return SDValue();
   }
 
@@ -23937,7 +23949,6 @@ const char *X86TargetLowering::getTarget
   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
-  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Mon Feb 20 08:16:29 2017
@@ -446,8 +446,7 @@ namespace llvm {
       // Broadcast subvector to vector.
       SUBV_BROADCAST,
 
-      // Insert/Extract vector element.
-      VINSERT,
+      // Extract vector element.
       VEXTRACT,
 
       /// SSE4A Extraction and Insertion.

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Feb 20 08:16:29 2017
@@ -3580,19 +3580,6 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
 }
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
 //===----------------------------------------------------------------------===//
 // AVX-512 - Non-temporals
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Mon Feb 20 08:16:29 2017
@@ -453,9 +453,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::S
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
-                              [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
-                               SDTCisPtrTy<3>]>, []>;
 def X86Vextract   : SDNode<"X86ISD::VEXTRACT",  SDTypeProfile<1, 2,
                               [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
                                SDTCisPtrTy<2>]>, []>;

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Feb 20 08:16:29 2017
@@ -4702,19 +4702,6 @@ def MOVPDI2DImr  : S2I<0x7E, MRMDestMem,
                                      (iPTR 0))), addr:$dst)],
                                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 } // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Mon Feb 20 08:16:29 2017
@@ -124,16 +124,30 @@ define void @test6(<4 x float> %x, float
 define float @test7(<16 x float> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test7:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test7:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %e = extractelement <16 x float> %x, i32 %ind
   ret float %e
@@ -142,18 +156,30 @@ define float @test7(<16 x float> %x, i32
 define double @test8(<8 x double> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test8:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movslq %edi, %rax
-; KNL-NEXT:    vmovq %rax, %xmm1
-; KNL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movslq %edi, %rax
-; SKX-NEXT:    vmovq %rax, %xmm1
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %e = extractelement <8 x double> %x, i32 %ind
   ret double %e
@@ -162,16 +188,30 @@ define double @test8(<8 x double> %x, i3
 define float @test9(<8 x float> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test9:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %e = extractelement <8 x float> %x, i32 %ind
   ret float %e
@@ -180,16 +220,30 @@ define float @test9(<8 x float> %x, i32
 define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test10:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %e = extractelement <16 x i32> %x, i32 %ind
   ret i32 %e
@@ -1514,18 +1568,42 @@ define i64 @test_extractelement_variable
 define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v8i64:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movslq %edi, %rax
-; KNL-NEXT:    vmovq %rax, %xmm1
-; KNL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi6:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi7:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi8:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8i64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movslq %edi, %rax
-; SKX-NEXT:    vmovq %rax, %xmm1
-; SKX-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi3:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi4:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi5:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movq (%rsp,%rdi,8), %rax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <8 x i64> %t1, i32 %index
   ret i64 %t2
@@ -1555,12 +1633,12 @@ define double @test_extractelement_varia
 ; KNL-LABEL: test_extractelement_variable_v4f64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Lcfi6:
+; KNL-NEXT:  Lcfi9:
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:  Lcfi7:
+; KNL-NEXT:  Lcfi10:
 ; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:  Lcfi8:
+; KNL-NEXT:  Lcfi11:
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
@@ -1575,12 +1653,12 @@ define double @test_extractelement_varia
 ; SKX-LABEL: test_extractelement_variable_v4f64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:  Lcfi3:
+; SKX-NEXT:  Lcfi6:
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
-; SKX-NEXT:  Lcfi4:
+; SKX-NEXT:  Lcfi7:
 ; SKX-NEXT:    .cfi_offset %rbp, -16
 ; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:  Lcfi5:
+; SKX-NEXT:  Lcfi8:
 ; SKX-NEXT:    .cfi_def_cfa_register %rbp
 ; SKX-NEXT:    andq $-32, %rsp
 ; SKX-NEXT:    subq $64, %rsp
@@ -1598,18 +1676,42 @@ define double @test_extractelement_varia
 define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v8f64:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movslq %edi, %rax
-; KNL-NEXT:    vmovq %rax, %xmm1
-; KNL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi12:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi13:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi14:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8f64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movslq %edi, %rax
-; SKX-NEXT:    vmovq %rax, %xmm1
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi9:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi10:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi11:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <8 x double> %t1, i32 %index
   ret double %t2
@@ -1638,16 +1740,42 @@ define i32 @test_extractelement_variable
 define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v8i32:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi15:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi16:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi17:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8i32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi12:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi13:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi14:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <8 x i32> %t1, i32 %index
   ret i32 %t2
@@ -1656,16 +1784,42 @@ define i32 @test_extractelement_variable
 define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v16i32:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi18:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi19:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi20:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v16i32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi15:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi16:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi17:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <16 x i32> %t1, i32 %index
   ret i32 %t2
@@ -1694,16 +1848,42 @@ define float @test_extractelement_variab
 define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v8f32:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi21:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi22:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi23:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8f32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi18:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi19:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi20:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <8 x float> %t1, i32 %index
   ret float %t2
@@ -1712,16 +1892,42 @@ define float @test_extractelement_variab
 define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v16f32:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi24:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi25:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi26:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v16f32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi21:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi22:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi23:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    retq
   %t2 = extractelement <16 x float> %t1, i32 %index
   ret float %t2
@@ -1751,12 +1957,12 @@ define i16 @test_extractelement_variable
 ; KNL-LABEL: test_extractelement_variable_v16i16:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Lcfi9:
+; KNL-NEXT:  Lcfi27:
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:  Lcfi10:
+; KNL-NEXT:  Lcfi28:
 ; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:  Lcfi11:
+; KNL-NEXT:  Lcfi29:
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
@@ -1771,12 +1977,12 @@ define i16 @test_extractelement_variable
 ; SKX-LABEL: test_extractelement_variable_v16i16:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:  Lcfi6:
+; SKX-NEXT:  Lcfi24:
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
-; SKX-NEXT:  Lcfi7:
+; SKX-NEXT:  Lcfi25:
 ; SKX-NEXT:    .cfi_offset %rbp, -16
 ; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:  Lcfi8:
+; SKX-NEXT:  Lcfi26:
 ; SKX-NEXT:    .cfi_def_cfa_register %rbp
 ; SKX-NEXT:    andq $-32, %rsp
 ; SKX-NEXT:    subq $64, %rsp
@@ -1791,11 +1997,50 @@ define i16 @test_extractelement_variable
   ret i16 %t2
 }
 
-; TODO - enable after fix
-;define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
-;  %t2 = extractelement <32 x i16> %t1, i32 %index
-;  ret i16 %t2
-;}
+define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v32i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi30:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi31:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi32:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi27:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi28:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi29:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT:    andl $31, %edi
+; SKX-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    retq
+  %t2 = extractelement <32 x i16> %t1, i32 %index
+  ret i16 %t2
+}
 
 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v16i8:
@@ -1823,12 +2068,12 @@ define i8 @test_extractelement_variable_
 ; KNL-LABEL: test_extractelement_variable_v32i8:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Lcfi12:
+; KNL-NEXT:  Lcfi33:
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:  Lcfi13:
+; KNL-NEXT:  Lcfi34:
 ; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:  Lcfi14:
+; KNL-NEXT:  Lcfi35:
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
@@ -1844,12 +2089,12 @@ define i8 @test_extractelement_variable_
 ; SKX-LABEL: test_extractelement_variable_v32i8:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:  Lcfi9:
+; SKX-NEXT:  Lcfi30:
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
-; SKX-NEXT:  Lcfi10:
+; SKX-NEXT:  Lcfi31:
 ; SKX-NEXT:    .cfi_offset %rbp, -16
 ; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:  Lcfi11:
+; SKX-NEXT:  Lcfi32:
 ; SKX-NEXT:    .cfi_def_cfa_register %rbp
 ; SKX-NEXT:    andq $-32, %rsp
 ; SKX-NEXT:    subq $64, %rsp
@@ -1866,8 +2111,101 @@ define i8 @test_extractelement_variable_
   ret i8 %t2
 }
 
-; TODO - enable after fix
-;define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
-;  %t2 = extractelement <64 x i8> %t1, i32 %index
-;  ret i8 %t2
-;}
+define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi36:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi37:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi38:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $63, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi33:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi34:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi35:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT:    andl $63, %edi
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    retq
+
+  %t2 = extractelement <64 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi39:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi40:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi41:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    addb %dil, %dil
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    movzbl %dil, %eax
+; KNL-NEXT:    andl $63, %eax
+; KNL-NEXT:    movq %rsp, %rcx
+; KNL-NEXT:    movb (%rax,%rcx), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi36:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi37:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi38:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    addb %dil, %dil
+; SKX-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT:    movzbl %dil, %eax
+; SKX-NEXT:    andl $63, %eax
+; SKX-NEXT:    movq %rsp, %rcx
+; SKX-NEXT:    movb (%rax,%rcx), %al
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    retq
+
+  %i  = add i8 %index, %index
+  %t2 = extractelement <64 x i8> %t1, i8 %i
+  ret i8 %t2
+}

Modified: llvm/trunk/test/CodeGen/X86/extractelement-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-index.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-index.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-index.ll Mon Feb 20 08:16:29 2017
@@ -538,27 +538,19 @@ define i32 @extractelement_v8i32_var(<8
 ; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: extractelement_v8i32_var:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    andl $7, %edi
-; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX1-NEXT:    movl (%rsp,%rdi,4), %eax
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: extractelement_v8i32_var:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd %edi, %xmm1
-; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: extractelement_v8i32_var:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    movq %rsp, %rbp
+; AVX-NEXT:    andq $-32, %rsp
+; AVX-NEXT:    subq $64, %rsp
+; AVX-NEXT:    andl $7, %edi
+; AVX-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX-NEXT:    movl (%rsp,%rdi,4), %eax
+; AVX-NEXT:    movq %rbp, %rsp
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %b = extractelement <8 x i32> %a, i256 %i
   ret i32 %b
 }

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll?rev=295660&r1=295659&r2=295660&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll Mon Feb 20 08:16:29 2017
@@ -236,70 +236,43 @@ define <4 x i64> @var_shuffle_v4i64_v2i6
 }
 
 define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
-; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX1-NEXT:    andl $7, %edi
-; AVX1-NEXT:    andl $7, %esi
-; AVX1-NEXT:    andl $7, %edx
-; AVX1-NEXT:    andl $7, %ecx
-; AVX1-NEXT:    andl $7, %r8d
-; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX1-NEXT:    andl $7, %r9d
-; AVX1-NEXT:    movl 16(%rbp), %r10d
-; AVX1-NEXT:    andl $7, %r10d
-; AVX1-NEXT:    movl 24(%rbp), %eax
-; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd %edi, %xmm1
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vmovd %esi, %xmm2
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vmovd %edx, %xmm3
-; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
-; AVX2-NEXT:    vmovd %r8d, %xmm5
-; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
-; AVX2-NEXT:    vmovd %r9d, %xmm6
-; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
-; AVX2-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
-; AVX2-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    pushq %rbp
+; ALL-NEXT:    movq %rsp, %rbp
+; ALL-NEXT:    andq $-32, %rsp
+; ALL-NEXT:    subq $64, %rsp
+; ALL-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
+; ALL-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
+; ALL-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ALL-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
+; ALL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT:    andl $7, %edi
+; ALL-NEXT:    andl $7, %esi
+; ALL-NEXT:    andl $7, %edx
+; ALL-NEXT:    andl $7, %ecx
+; ALL-NEXT:    andl $7, %r8d
+; ALL-NEXT:    vmovaps %ymm0, (%rsp)
+; ALL-NEXT:    andl $7, %r9d
+; ALL-NEXT:    movl 16(%rbp), %r10d
+; ALL-NEXT:    andl $7, %r10d
+; ALL-NEXT:    movl 24(%rbp), %eax
+; ALL-NEXT:    andl $7, %eax
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT:    movq %rbp, %rsp
+; ALL-NEXT:    popq %rbp
+; ALL-NEXT:    retq
   %x0 = extractelement <8 x float> %x, i32 %i0
   %x1 = extractelement <8 x float> %x, i32 %i1
   %x2 = extractelement <8 x float> %x, i32 %i2




More information about the llvm-commits mailing list