[llvm] abd33bf - [X86][AVX] lowerShuffleWithPERMV - pad 128/256-bit shuffles on non-VLX targets
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 18 07:50:45 PDT 2020
Author: Simon Pilgrim
Date: 2020-08-18T15:46:02+01:00
New Revision: abd33bf5eff2419e0f49ce494039bceefe8e1085
URL: https://github.com/llvm/llvm-project/commit/abd33bf5eff2419e0f49ce494039bceefe8e1085
DIFF: https://github.com/llvm/llvm-project/commit/abd33bf5eff2419e0f49ce494039bceefe8e1085.diff
LOG: [X86][AVX] lowerShuffleWithPERMV - pad 128/256-bit shuffles on non-VLX targets
Allow non-VLX targets to use 512-bit VPERMV/VPERMV3 for 128/256-bit shuffles.
TBH I'm not sure these targets actually exist in the wild, but we're testing for them and it's good test coverage for shuffle lowering/combines across different subvector widths.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0fbabdc5dfdf..ec4d236dc3ea 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14969,17 +14969,35 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG);
}
+// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
+// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
+// the active subvector is extracted.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
-
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+
+ MVT ShuffleVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
+ V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
+ MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
+ ShuffleVT = V1.getSimpleValueType();
+ }
+
+ SDValue Result;
if (V2.isUndef())
- return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+ Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
+ else
+ Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
+
+ if (VT != ShuffleVT)
+ Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+ return Result;
}
/// Generic lowering of v16i8 shuffles.
@@ -15208,9 +15226,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
- // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
- if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
+ DAG);
// If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
if (Subtarget.hasXOP()) {
@@ -16964,9 +16983,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512BWVL can lower to VPERMW.
- if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+ // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
+ if (Subtarget.hasBWI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -17069,9 +17088,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512VBMIVL can lower to VPERMB.
- if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -17325,7 +17344,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
@@ -17384,7 +17403,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
@@ -17447,7 +17466,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
@@ -17524,7 +17543,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
@@ -17587,7 +17606,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
@@ -17643,7 +17662,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 969ac375a70e..40cd2fcd4fde 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -85,12 +85,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -260,20 +258,11 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
@@ -327,20 +316,11 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
@@ -394,20 +374,11 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index de13135ebb53..9e3c92aca5da 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -328,8 +328,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
@@ -413,8 +413,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
@@ -457,13 +457,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -478,13 +475,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512VBMI-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
More information about the llvm-commits
mailing list