[llvm] [X86] lowerV8I16GeneralSingleInputShuffle - for splat PSHUFW+PSHUFD patterns, widen the splats to encourage combines (PR #129854)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 01:12:27 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
For vXi16 patterns that lower to splats, ensure that PSHUFW mask splats to the entire LW/HW half and then create a wide PSHUFD mask that splats the whole i64 element - this encourages further combines without depending on any unused elements from undef shuffle mask elements.
Fixes #<!-- -->129276
---
Patch is 228.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129854.diff
47 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+8)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+14-14)
- (modified) llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/avx-splat.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/cast-vsel.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/combine-and.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/combine-pavg.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/half.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/icmp-pow2-mask.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/insertelement-var-index.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/memset-inline.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/memset-nonzero.ll (+5-5)
- (modified) llvm/test/CodeGen/X86/pr46527.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr62014.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/psubus.ll (+9-9)
- (modified) llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (+18-18)
- (modified) llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll (+24-24)
- (modified) llvm/test/CodeGen/X86/vec_set-H.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-128.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-256.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-128.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-256.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+92-92)
- (modified) llvm/test/CodeGen/X86/vector-rotate-128.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-rotate-256.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-sext.ll (+18-18)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-128.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-128.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-128.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll (+10-10)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll (+34-34)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll (+93-108)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll (+15-30)
- (modified) llvm/test/CodeGen/X86/vector-trunc.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-unsigned-cmp.ll (+5-5)
- (modified) llvm/test/CodeGen/X86/vector-zext.ll (+5-6)
- (modified) llvm/test/CodeGen/X86/widened-broadcast.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll (+10-10)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+10-10)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 882833f15b432..44844c3845e0b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13776,6 +13776,14 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
+ // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
+ if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
+ ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
+ int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
+ std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
+ PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
+ PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
+ }
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 951a2b4cafa26..cbab8ac5f492f 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2363,7 +2363,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa 16(%rdx), %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
@@ -2384,7 +2384,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -2405,7 +2405,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -2497,7 +2497,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -2762,7 +2762,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -2783,7 +2783,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -2802,7 +2802,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -3549,7 +3549,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -3568,7 +3568,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
@@ -3586,7 +3586,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
@@ -4963,7 +4963,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa 16(%rdx), %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa (%rdx), %xmm2
@@ -4982,7 +4982,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index ca589e63b671a..305509ca7fc3f 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1837,7 +1837,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
@@ -1856,7 +1856,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -1875,9 +1875,9 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
@@ -1954,7 +1954,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm1
@@ -2174,7 +2174,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
@@ -2192,7 +2192,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
@@ -2208,9 +2208,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
@@ -2835,7 +2835,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
@@ -2850,7 +2850,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT: paddb (%rsi), %xmm1
@@ -2865,7 +2865,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
@@ -3913,7 +3913,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa 16(%rsi), %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm2
@@ -3930,7 +3930,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index b7dea5f4151e2..7d0a5679936da 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1899,7 +1899,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
@@ -1907,7 +1907,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 5d3a6e3e41217..15c2aab1a82e1 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -17,7 +17,7 @@ define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index efb7ba393721b..423f2c49e70e5 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -97,7 +97,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -107,7 +107,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -286,7 +286,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
@@ -300,7 +300,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
@@ -519,7 +519,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm2
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
@@ -528,7 +528,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
@@ -540,7 +540,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshuf...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/129854
More information about the llvm-commits
mailing list