[llvm] r289354 - [AVX-512][InstCombine] Add 512-bit vpermilvar intrinsics to InstCombineCalls to match 128 and 256-bit.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Dec 10 17:59:36 PST 2016


Author: ctopper
Date: Sat Dec 10 19:59:36 2016
New Revision: 289354

URL: http://llvm.org/viewvc/llvm-project?rev=289354&view=rev
Log:
[AVX-512][InstCombine] Add 512-bit vpermilvar intrinsics to InstCombineCalls to match 128 and 256-bit.

Added:
    llvm/trunk/test/Transforms/InstCombine/x86-vpermil.ll
      - copied, changed from r289352, llvm/trunk/test/Transforms/InstCombine/x86-avx.ll
Removed:
    llvm/trunk/test/Transforms/InstCombine/x86-avx.ll
Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=289354&r1=289353&r2=289354&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Sat Dec 10 19:59:36 2016
@@ -849,12 +849,15 @@ static Value *simplifyX86vpermilvar(cons
   if (!V)
     return nullptr;
 
+  auto *VecTy = cast<VectorType>(II.getType());
   auto *MaskEltTy = Type::getInt32Ty(II.getContext());
-  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
-  assert(NumElts == 8 || NumElts == 4 || NumElts == 2);
+  unsigned NumElts = VecTy->getVectorNumElements();
+  bool IsPD = VecTy->getScalarType()->isDoubleTy();
+  unsigned NumLaneElts = IsPD ? 2 : 4;
+  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
 
   // Construct a shuffle mask from constant integers or UNDEFs.
-  Constant *Indexes[8] = {nullptr};
+  Constant *Indexes[16] = {nullptr};
 
   // The intrinsics only read one or two bits; clear the rest.
   for (unsigned I = 0; I < NumElts; ++I) {
@@ -872,18 +875,13 @@ static Value *simplifyX86vpermilvar(cons
 
     // The PD variants use bit 1 to select the per-lane element index, so
     // shift down to convert to a generic shuffle mask index.
-    if (II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
-        II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
+    if (IsPD)
       Index = Index.lshr(1);
 
     // The _256 variants are a bit trickier since the mask bits always index
     // into the corresponding 128-bit half. In order to convert to a generic
     // shuffle, we have to make that explicit.
-    if ((II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
-         II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) &&
-        ((NumElts / 2) <= I)) {
-      Index += APInt(32, NumElts / 2);
-    }
+    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
 
     Indexes[I] = ConstantInt::get(MaskEltTy, Index);
   }
@@ -2088,8 +2086,10 @@ Instruction *InstCombiner::visitCallInst
 
   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
+  case Intrinsic::x86_avx512_vpermilvar_ps_512:
   case Intrinsic::x86_avx_vpermilvar_pd:
   case Intrinsic::x86_avx_vpermilvar_pd_256:
+  case Intrinsic::x86_avx512_vpermilvar_pd_512:
     if (Value *V = simplifyX86vpermilvar(*II, *Builder))
       return replaceInstUsesWith(*II, V);
     break;
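
For readers following the index math above: each vpermilvar mask element can
only address its own 128-bit lane, so the rewritten loop recovers a generic
shufflevector index by keeping the low two bits, shifting once for the PD
encoding, and adding the lane base back in. A minimal standalone sketch of
that loop for constant, non-undef mask elements (plain C++, no LLVM APIs; the
function and variable names are illustrative, not taken from the source):

#include <cstdio>
#include <vector>

// Hypothetical mirror of the patched loop in simplifyX86vpermilvar.
static std::vector<unsigned>
vpermilvarToShuffleMask(const std::vector<unsigned> &Mask, bool IsPD) {
  unsigned NumLaneElts = IsPD ? 2 : 4; // elements per 128-bit lane
  std::vector<unsigned> Indexes(Mask.size());
  for (unsigned I = 0; I < Mask.size(); ++I) {
    unsigned Index = Mask[I] & 3; // the intrinsics read at most two bits
    if (IsPD)
      Index >>= 1;                // PD selects with bit 1, not bit 0
    // Mask bits only index within the element's 128-bit lane; add the lane
    // base to turn them into full-width shufflevector indices.
    Indexes[I] = Index + (I / NumLaneElts) * NumLaneElts;
  }
  return Indexes;
}

int main() {
  // test_vpermilvar_ps_512: a full reverse mask becomes a per-lane reverse.
  std::vector<unsigned> PS(16);
  for (unsigned I = 0; I < 16; ++I)
    PS[I] = 15 - I;
  for (unsigned Idx : vpermilvarToShuffleMask(PS, /*IsPD=*/false))
    std::printf("%u ", Idx); // 3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12
  std::printf("\n");

  // test_vpermilvar_pd_512: <3,1,2,0,7,5,6,4> becomes <1,0,3,2,5,4,7,6>.
  std::vector<unsigned> PD = {3, 1, 2, 0, 7, 5, 6, 4};
  for (unsigned Idx : vpermilvarToShuffleMask(PD, /*IsPD=*/true))
    std::printf("%u ", Idx);
  std::printf("\n");
  return 0;
}

Both printed masks match the CHECK lines of the corresponding new 512-bit
tests in the diff below.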

Removed: llvm/trunk/test/Transforms/InstCombine/x86-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-avx.ll?rev=289353&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-avx.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-avx.ll (removed)
@@ -1,158 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Verify that instcombine is able to fold identity shuffles.
-
-define <4 x float> @identity_test_vpermilvar_ps(<4 x float> %v) {
-; CHECK-LABEL: @identity_test_vpermilvar_ps(
-; CHECK-NEXT:    ret <4 x float> %v
-;
-  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
-  ret <4 x float> %a
-}
-
-define <8 x float> @identity_test_vpermilvar_ps_256(<8 x float> %v) {
-; CHECK-LABEL: @identity_test_vpermilvar_ps_256(
-; CHECK-NEXT:    ret <8 x float> %v
-;
-  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
-  ret <8 x float> %a
-}
-
-define <2 x double> @identity_test_vpermilvar_pd(<2 x double> %v) {
-; CHECK-LABEL: @identity_test_vpermilvar_pd(
-; CHECK-NEXT:    ret <2 x double> %v
-;
-  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 0, i64 2>)
-  ret <2 x double> %a
-}
-
-define <4 x double> @identity_test_vpermilvar_pd_256(<4 x double> %v) {
-; CHECK-LABEL: @identity_test_vpermilvar_pd_256(
-; CHECK-NEXT:    ret <4 x double> %v
-;
-  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 0, i64 2, i64 0, i64 2>)
-  ret <4 x double> %a
-}
-
-; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
-; with a shuffle mask of all zeroes.
-
-define <4 x float> @zero_test_vpermilvar_ps_zero(<4 x float> %v) {
-; CHECK-LABEL: @zero_test_vpermilvar_ps_zero(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
-  ret <4 x float> %a
-}
-
-define <8 x float> @zero_test_vpermilvar_ps_256_zero(<8 x float> %v) {
-; CHECK-LABEL: @zero_test_vpermilvar_ps_256_zero(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    ret <8 x float> [[TMP1]]
-;
-  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
-  ret <8 x float> %a
-}
-
-define <2 x double> @zero_test_vpermilvar_pd_zero(<2 x double> %v) {
-; CHECK-LABEL: @zero_test_vpermilvar_pd_zero(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
-  ret <2 x double> %a
-}
-
-define <4 x double> @zero_test_vpermilvar_pd_256_zero(<4 x double> %v) {
-; CHECK-LABEL: @zero_test_vpermilvar_pd_256_zero(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
-  ret <4 x double> %a
-}
-
-; Verify that instcombine is able to fold constant shuffles.
-
-define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
-; CHECK-LABEL: @test_vpermilvar_ps(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
-  ret <4 x float> %a
-}
-
-define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
-; CHECK-LABEL: @test_vpermilvar_ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    ret <8 x float> [[TMP1]]
-;
-  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
-  ret <8 x float> %a
-}
-
-define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
-; CHECK-LABEL: @test_vpermilvar_pd(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
-  ret <2 x double> %a
-}
-
-define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
-; CHECK-LABEL: @test_vpermilvar_pd_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
-  ret <4 x double> %a
-}
-
-; Verify that instcombine is able to fold constant shuffles with undef mask elements.
-
-define <4 x float> @undef_test_vpermilvar_ps(<4 x float> %v) {
-; CHECK-LABEL: @undef_test_vpermilvar_ps(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>)
-  ret <4 x float> %a
-}
-
-define <8 x float> @undef_test_vpermilvar_ps_256(<8 x float> %v) {
-; CHECK-LABEL: @undef_test_vpermilvar_ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    ret <8 x float> [[TMP1]]
-;
-  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
-  ret <8 x float> %a
-}
-
-define <2 x double> @undef_test_vpermilvar_pd(<2 x double> %v) {
-; CHECK-LABEL: @undef_test_vpermilvar_pd(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 undef, i64 0>)
-  ret <2 x double> %a
-}
-
-define <4 x double> @undef_test_vpermilvar_pd_256(<4 x double> %v) {
-; CHECK-LABEL: @undef_test_vpermilvar_pd_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 undef, i32 0, i32 3, i32 undef>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 undef, i64 1, i64 2, i64 undef>)
-  ret <4 x double> %a
-}
-
-declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
-declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
-
-declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
-declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)

Copied: llvm/trunk/test/Transforms/InstCombine/x86-vpermil.ll (from r289352, llvm/trunk/test/Transforms/InstCombine/x86-avx.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-vpermil.ll?p2=llvm/trunk/test/Transforms/InstCombine/x86-vpermil.ll&p1=llvm/trunk/test/Transforms/InstCombine/x86-avx.ll&r1=289352&r2=289354&rev=289354&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-avx.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-vpermil.ll Sat Dec 10 19:59:36 2016
@@ -20,6 +20,14 @@ define <8 x float> @identity_test_vpermi
   ret <8 x float> %a
 }
 
+define <16 x float> @identity_test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_ps_512(
+; CHECK-NEXT:    ret <16 x float> %v
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>)
+  ret <16 x float> %a
+}
+
 define <2 x double> @identity_test_vpermilvar_pd(<2 x double> %v) {
 ; CHECK-LABEL: @identity_test_vpermilvar_pd(
 ; CHECK-NEXT:    ret <2 x double> %v
@@ -36,6 +44,14 @@ define <4 x double> @identity_test_vperm
   ret <4 x double> %a
 }
 
+define <8 x double> @identity_test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_pd_512(
+; CHECK-NEXT:    ret <8 x double> %v
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 0, i64 2, i64 0, i64 2, i64 0, i64 2, i64 0, i64 2>)
+  ret <8 x double> %a
+}
+
 ; Instcombine should be able to fold the following shuffle to a shufflevector
 ; with a shuffle mask of all zeroes.
 
@@ -57,6 +73,15 @@ define <8 x float> @zero_test_vpermilvar
   ret <8 x float> %a
 }
 
+define <16 x float> @zero_test_vpermilvar_ps_512_zero(<16 x float> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_ps_512_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> %v, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> zeroinitializer)
+  ret <16 x float> %a
+}
+
 define <2 x double> @zero_test_vpermilvar_pd_zero(<2 x double> %v) {
 ; CHECK-LABEL: @zero_test_vpermilvar_pd_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
@@ -75,6 +100,15 @@ define <4 x double> @zero_test_vpermilva
   ret <4 x double> %a
 }
 
+define <8 x double> @zero_test_vpermilvar_pd_512_zero(<8 x double> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_pd_512_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> %v, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> zeroinitializer)
+  ret <8 x double> %a
+}
+
 ; Verify that instcombine is able to fold constant shuffles.
 
 define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
@@ -95,6 +129,15 @@ define <8 x float> @test_vpermilvar_ps_2
   ret <8 x float> %a
 }
 
+define <16 x float> @test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> %v, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %a
+}
+
 define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
 ; CHECK-LABEL: @test_vpermilvar_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -113,6 +156,15 @@ define <4 x double> @test_vpermilvar_pd_
   ret <4 x double> %a
 }
 
+define <8 x double> @test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> %v, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 3, i64 1, i64 2, i64 0, i64 7, i64 5, i64 6, i64 4>)
+  ret <8 x double> %a
+}
+
 ; Verify that instcombine is able to fold constant shuffles with undef mask elements.
 
 define <4 x float> @undef_test_vpermilvar_ps(<4 x float> %v) {
@@ -133,6 +185,15 @@ define <8 x float> @undef_test_vpermilva
   ret <8 x float> %a
 }
 
+define <16 x float> @undef_test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_ps_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> %v, <16 x float> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 7, i32 6, i32 5, i32 4, i32 undef, i32 10, i32 9, i32 undef, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0, i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %a
+}
+
 define <2 x double> @undef_test_vpermilvar_pd(<2 x double> %v) {
 ; CHECK-LABEL: @undef_test_vpermilvar_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
@@ -151,8 +212,19 @@ define <4 x double> @undef_test_vpermilv
   ret <4 x double> %a
 }
 
+define <8 x double> @undef_test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_pd_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> %v, <8 x double> undef, <8 x i32> <i32 undef, i32 0, i32 3, i32 undef, i32 undef, i32 4, i32 7, i32 undef>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 undef, i64 1, i64 2, i64 undef, i64 undef, i64 1, i64 2, i64 undef>)
+  ret <8 x double> %a
+}
+
 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
+declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
 
 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
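
The undef_test cases above are the trickiest of the new 512-bit tests: undef
mask elements simply stay undef in the resulting shuffle mask, while defined
elements still get the PD shift and the lane offset. A standalone sketch
checking undef_test_vpermilvar_pd_512 by hand (plain C++17, no LLVM APIs;
names are illustrative):

#include <cstdio>
#include <optional>
#include <vector>

// An undef mask element (std::nullopt here, UNDEF in IR) passes through
// untouched; defined elements get the PD shift and lane offset as before.
using MaskElt = std::optional<unsigned>;

static std::vector<MaskElt> pdToShuffleMask(const std::vector<MaskElt> &Mask) {
  const unsigned NumLaneElts = 2; // two doubles per 128-bit lane
  std::vector<MaskElt> Indexes(Mask.size());
  for (unsigned I = 0; I < Mask.size(); ++I) {
    if (!Mask[I])
      continue; // undef stays undef
    unsigned Index = (*Mask[I] & 3) >> 1; // PD selects with bit 1
    Indexes[I] = Index + (I / NumLaneElts) * NumLaneElts;
  }
  return Indexes;
}

int main() {
  // Mask from undef_test_vpermilvar_pd_512: <undef,1,2,undef,undef,1,2,undef>
  std::vector<MaskElt> Mask = {std::nullopt, 1, 2, std::nullopt,
                               std::nullopt, 1, 2, std::nullopt};
  for (const MaskElt &E : pdToShuffleMask(Mask))
    E ? std::printf("%u ", *E) : std::printf("undef ");
  std::printf("\n"); // undef 0 3 undef undef 4 7 undef
  return 0;
}

The printed mask matches the expected shufflevector mask
<undef,0,3,undef,undef,4,7,undef> in that test's CHECK line.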



