[llvm] 3ce544e - [X86] lowerShuffleAsBroadcast - improve handling of non-zero element index broadcasts
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 11 09:12:56 PST 2024
Author: Simon Pilgrim
Date: 2024-11-11T17:08:16Z
New Revision: 3ce544e6be098b5c355140de78bc49069fda33c3
URL: https://github.com/llvm/llvm-project/commit/3ce544e6be098b5c355140de78bc49069fda33c3
DIFF: https://github.com/llvm/llvm-project/commit/3ce544e6be098b5c355140de78bc49069fda33c3.diff
LOG: [X86] lowerShuffleAsBroadcast - improve handling of non-zero element index broadcasts
On AVX2+, support broadcasting any element that lies in the bottom 128-bit subvector by first shuffling it down to element 0 and then broadcasting.
Fixes #113396
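
To illustrate the pattern this targets, here is a minimal IR sketch adapted from the shuffle_v8i8_01_..._01 test updated in vector-shuffle-512-v64.ll below (the function name here is illustrative): a splat of element 1, i.e. a non-zero index that still lies in the bottom 128-bit subvector. Previously the (BitOffset % 128) != 0 check bailed out of the broadcast lowering; with this change the element is shuffled down to lane 0 first and then broadcast directly.

define <64 x i8> @splat_elt1(<8 x i8> %v) {
  ; Broadcast element 1 of %v (a non-zero index inside the low 128 bits)
  ; across all 64 lanes of the result.
  %s = shufflevector <8 x i8> %v, <8 x i8> poison,
       <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                   i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <64 x i8> %s
}

Per the new AVX512BW check lines in the test diff, this now lowers to vpsrlw $8 followed by vpbroadcastb of the whole zmm, rather than the previous vpunpcklbw/vpsrld/vpbroadcastw sequence.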
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/matrix-multiply.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/test/CodeGen/X86/vector-shuffle-v1.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e2abbcec68b204..c08efc9f7271e6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12637,6 +12637,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
+ int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
@@ -12756,16 +12757,28 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
- // Only broadcast the zero-element of a 128-bit subvector.
- if ((BitOffset % 128) != 0)
- return SDValue();
+ // If we are broadcasting an element from the lowest 128-bit subvector, try
+ // to move the element in position.
+ if (BitOffset < 128 && NumActiveElts > 1 &&
+ V.getScalarValueSizeInBits() == NumEltBits) {
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
+ SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
+ ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
+ V = extractSubVector(V, 0, DAG, DL, 128);
+ V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
+ } else {
+ // Only broadcast the zero-element of a 128-bit subvector.
+ if ((BitOffset % 128) != 0)
+ return SDValue();
- assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
- "Unexpected bit-offset");
- assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
- "Unexpected vector size");
- unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
- V = extract128BitVector(V, ExtractIdx, DAG, DL);
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
+ assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
+ "Unexpected vector size");
+ unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
+ V = extract128BitVector(V, ExtractIdx, DAG, DL);
+ }
}
// On AVX we can use VBROADCAST directly for scalar sources.
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 5bce0bb5a60dc8..ed668c6ef4b043 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -2563,15 +2563,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512F-NEXT: vbroadcastss %xmm4, %ymm12
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
-; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512F-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0]
+; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13
@@ -2627,15 +2627,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512F-NEXT: vbroadcastss %xmm5, %ymm13
; AVX512F-NEXT: vmulps %ymm0, %ymm13, %ymm13
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
-; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0]
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm14
@@ -2689,15 +2689,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
-; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm6[1,0]
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm14
@@ -2753,15 +2753,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
-; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vshufpd {{.*#+}} xmm15 = xmm7[1,0]
+; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm15
@@ -2828,15 +2828,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12
; AVX512VL-NEXT: vmulps %ymm0, %ymm12, %ymm12
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512VL-NEXT: vmulps %ymm13, %ymm11, %ymm13
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0]
+; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512VL-NEXT: vmulps %ymm1, %ymm13, %ymm13
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
; AVX512VL-NEXT: vmulps %ymm13, %ymm10, %ymm13
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm13
@@ -2890,15 +2890,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm13
; AVX512VL-NEXT: vmulps %ymm0, %ymm13, %ymm13
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512VL-NEXT: vmulps %ymm14, %ymm11, %ymm14
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0]
+; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512VL-NEXT: vmulps %ymm1, %ymm14, %ymm14
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
; AVX512VL-NEXT: vmulps %ymm14, %ymm10, %ymm14
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm14
@@ -2952,15 +2952,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm14
; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm15 = xmm6[1,0]
+; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm15
@@ -3014,15 +3014,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm15
; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm16 = xmm7[1,0]
+; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT: vextractf32x4 $1, %ymm7, %xmm16
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 652a7f4f62a81d..97c6c4afa59909 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1453,32 +1453,28 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_
define <64 x i8> @shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01(<8 x i8> %0) {
; AVX512F-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VBMI-NEXT: retq
%s = shufflevector <8 x i8> %0, <8 x i8> poison, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <64 x i8> %s
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 50a6599947c901..6f9b3e94aa68f6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -501,8 +501,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512VL-FAST-PERLANE-NEXT: kmovw %edi, %k1
; AVX512VL-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -525,8 +525,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ-FAST-PERLANE: # %bb.0:
; VL_BW_DQ-FAST-PERLANE-NEXT: kmovd %edi, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-PERLANE-NEXT: vzeroupper